knitr::opts_chunk$set(echo = TRUE)
INSTALLING LIBRARIES
# install.packages("Hmisc")
# install.packages("pastecs")
# install.packages("ggplot2")
# install.packages("Hmisc")
# install.packages("fastDummies")
# install.packages("lmtest")
# install.packages("lmtest")
# install.packages("caretEnsemble")
# install.packages("Amelia")
# install.packages("GGally")
library(ggplot2)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(corrplot)
## corrplot 0.92 loaded
library(caret)
## Loading required package: lattice
library(glmnet)
## Loading required package: Matrix
## Loaded glmnet 4.1-4
library(leaps)
library(reshape2)
library(gridExtra)
library(fastDummies)
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(pastecs)
library(skimr)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.1.6 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ psych::%+%() masks ggplot2::%+%()
## ✖ psych::alpha() masks ggplot2::alpha()
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ tidyr::extract() masks pastecs::extract()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks pastecs::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::last() masks pastecs::last()
## ✖ purrr::lift() masks caret::lift()
## ✖ tidyr::pack() masks Matrix::pack()
## ✖ tidyr::unpack() masks Matrix::unpack()
library(caret)
library(caretEnsemble)
##
## Attaching package: 'caretEnsemble'
##
## The following object is masked from 'package:ggplot2':
##
## autoplot
library(psych)
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.8.1, built: 2022-11-18)
## ## Copyright (C) 2005-2022 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(mice)
##
## Attaching package: 'mice'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(rpart)
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:gridExtra':
##
## combine
##
## The following object is masked from 'package:psych':
##
## outlier
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(nnet)
library(ROCR)
library(Metrics)
##
## Attaching package: 'Metrics'
##
## The following objects are masked from 'package:caret':
##
## precision, recall
library(caret)
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
##
## Attaching package: 'forecast'
##
## The following object is masked from 'package:Metrics':
##
## accuracy
##
## The following object is masked from 'package:caretEnsemble':
##
## autoplot
library(rpart)
library(rattle)
## Loading required package: bitops
##
## Attaching package: 'bitops'
##
## The following object is masked from 'package:Matrix':
##
## %&%
##
## Rattle: A free graphical interface for data science with R.
## Version 5.5.1 Copyright (c) 2006-2021 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
##
## Attaching package: 'rattle'
##
## The following object is masked from 'package:randomForest':
##
## importance
library(ggplot2)
library(plyr)
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
##
## Attaching package: 'plyr'
##
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
##
## The following object is masked from 'package:purrr':
##
## compact
library(rlist)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following object is masked from 'package:Metrics':
##
## auc
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(ROSE)
## Loaded ROSE 0.0-4
library(plotly)
##
## Attaching package: 'plotly'
##
## The following objects are masked from 'package:plyr':
##
## arrange, mutate, rename, summarise
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(rattle)
library(rpart.plot)
library(RColorBrewer)
LOADING/READ FILE
df = readr::read_csv("caravan-insurance-challenge.csv", show_col_types = FALSE)
dim(df)
## [1] 9822 87
Total number of observations are 9822 Total number of variables are 86 ignoring the 1st column which is meaningless for our data
skim(df)
| Name | df |
| Number of rows | 9822 |
| Number of columns | 87 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 86 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| ORIGIN | 0 | 1 | 4 | 5 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| MOSTYPE | 0 | 1 | 24.25 | 12.92 | 1 | 10 | 30 | 35 | 41 | ▆▃▃▇▇ |
| MAANTHUI | 0 | 1 | 1.11 | 0.41 | 1 | 1 | 1 | 1 | 10 | ▇▁▁▁▁ |
| MGEMOMV | 0 | 1 | 2.68 | 0.78 | 1 | 2 | 3 | 3 | 6 | ▇▇▂▁▁ |
| MGEMLEEF | 0 | 1 | 3.00 | 0.80 | 1 | 2 | 3 | 3 | 6 | ▃▇▃▁▁ |
| MOSHOOFD | 0 | 1 | 5.78 | 2.87 | 1 | 3 | 7 | 8 | 10 | ▃▃▃▇▃ |
| MGODRK | 0 | 1 | 0.70 | 1.02 | 0 | 0 | 0 | 1 | 9 | ▇▂▁▁▁ |
| MGODPR | 0 | 1 | 4.64 | 1.72 | 0 | 4 | 5 | 6 | 9 | ▁▂▇▃▁ |
| MGODOV | 0 | 1 | 1.05 | 1.01 | 0 | 0 | 1 | 2 | 5 | ▇▃▁▁▁ |
| MGODGE | 0 | 1 | 3.26 | 1.61 | 0 | 2 | 3 | 4 | 9 | ▂▇▇▁▁ |
| MRELGE | 0 | 1 | 6.19 | 1.90 | 0 | 5 | 6 | 7 | 9 | ▁▁▃▇▃ |
| MRELSA | 0 | 1 | 0.87 | 0.96 | 0 | 0 | 1 | 1 | 7 | ▇▂▁▁▁ |
| MRELOV | 0 | 1 | 2.29 | 1.71 | 0 | 1 | 2 | 3 | 9 | ▅▇▂▁▁ |
| MFALLEEN | 0 | 1 | 1.89 | 1.78 | 0 | 0 | 2 | 3 | 9 | ▇▆▂▁▁ |
| MFGEKIND | 0 | 1 | 3.24 | 1.61 | 0 | 2 | 3 | 4 | 9 | ▂▇▆▁▁ |
| MFWEKIND | 0 | 1 | 4.30 | 1.98 | 0 | 3 | 4 | 6 | 9 | ▂▆▇▃▁ |
| MOPLHOOG | 0 | 1 | 1.48 | 1.65 | 0 | 0 | 1 | 2 | 9 | ▇▃▁▁▁ |
| MOPLMIDD | 0 | 1 | 3.31 | 1.72 | 0 | 2 | 3 | 4 | 9 | ▃▇▇▂▁ |
| MOPLLAAG | 0 | 1 | 4.59 | 2.28 | 0 | 3 | 5 | 6 | 9 | ▂▆▇▆▂ |
| MBERHOOG | 0 | 1 | 1.90 | 1.81 | 0 | 0 | 2 | 3 | 9 | ▇▆▂▁▁ |
| MBERZELF | 0 | 1 | 0.40 | 0.79 | 0 | 0 | 0 | 1 | 5 | ▇▁▁▁▁ |
| MBERBOER | 0 | 1 | 0.55 | 1.11 | 0 | 0 | 0 | 1 | 9 | ▇▁▁▁▁ |
| MBERMIDD | 0 | 1 | 2.88 | 1.85 | 0 | 2 | 3 | 4 | 9 | ▃▇▃▁▁ |
| MBERARBG | 0 | 1 | 2.23 | 1.75 | 0 | 1 | 2 | 3 | 9 | ▆▇▃▁▁ |
| MBERARBO | 0 | 1 | 2.29 | 1.68 | 0 | 1 | 2 | 3 | 9 | ▆▇▃▁▁ |
| MSKA | 0 | 1 | 1.65 | 1.74 | 0 | 0 | 1 | 2 | 9 | ▇▅▁▁▁ |
| MSKB1 | 0 | 1 | 1.60 | 1.32 | 0 | 1 | 2 | 2 | 9 | ▇▇▁▁▁ |
| MSKB2 | 0 | 1 | 2.20 | 1.53 | 0 | 1 | 2 | 3 | 9 | ▅▇▃▁▁ |
| MSKC | 0 | 1 | 3.74 | 1.94 | 0 | 2 | 4 | 5 | 9 | ▂▇▇▂▁ |
| MSKD | 0 | 1 | 1.07 | 1.30 | 0 | 0 | 1 | 2 | 9 | ▇▂▁▁▁ |
| MHHUUR | 0 | 1 | 4.19 | 3.09 | 0 | 2 | 4 | 7 | 9 | ▇▇▆▅▇ |
| MHKOOP | 0 | 1 | 4.82 | 3.09 | 0 | 2 | 5 | 7 | 9 | ▇▅▆▇▇ |
| MAUT1 | 0 | 1 | 6.02 | 1.54 | 0 | 5 | 6 | 7 | 9 | ▁▁▅▇▂ |
| MAUT2 | 0 | 1 | 1.34 | 1.21 | 0 | 0 | 1 | 2 | 9 | ▇▅▁▁▁ |
| MAUT0 | 0 | 1 | 1.96 | 1.60 | 0 | 0 | 2 | 3 | 9 | ▇▇▂▁▁ |
| MZFONDS | 0 | 1 | 6.25 | 2.00 | 0 | 5 | 7 | 8 | 9 | ▁▂▅▇▅ |
| MZPART | 0 | 1 | 2.75 | 2.00 | 0 | 1 | 2 | 4 | 9 | ▅▇▅▂▁ |
| MINKM30 | 0 | 1 | 2.58 | 2.07 | 0 | 1 | 2 | 4 | 9 | ▇▇▅▂▁ |
| MINK3045 | 0 | 1 | 3.51 | 1.87 | 0 | 2 | 4 | 5 | 9 | ▃▇▇▂▁ |
| MINK4575 | 0 | 1 | 2.74 | 1.95 | 0 | 1 | 3 | 4 | 9 | ▅▇▅▁▁ |
| MINK7512 | 0 | 1 | 0.81 | 1.17 | 0 | 0 | 0 | 1 | 9 | ▇▂▁▁▁ |
| MINK123M | 0 | 1 | 0.21 | 0.56 | 0 | 0 | 0 | 0 | 9 | ▇▁▁▁▁ |
| MINKGEM | 0 | 1 | 3.80 | 1.33 | 0 | 3 | 4 | 4 | 9 | ▁▇▇▂▁ |
| MKOOPKLA | 0 | 1 | 4.26 | 2.00 | 1 | 3 | 4 | 6 | 8 | ▅▇▇▅▅ |
| PWAPART | 0 | 1 | 0.76 | 0.96 | 0 | 0 | 0 | 2 | 3 | ▇▁▁▅▁ |
| PWABEDR | 0 | 1 | 0.04 | 0.36 | 0 | 0 | 0 | 0 | 6 | ▇▁▁▁▁ |
| PWALAND | 0 | 1 | 0.07 | 0.51 | 0 | 0 | 0 | 0 | 4 | ▇▁▁▁▁ |
| PPERSAUT | 0 | 1 | 2.96 | 2.92 | 0 | 0 | 5 | 6 | 9 | ▇▁▂▆▁ |
| PBESAUT | 0 | 1 | 0.05 | 0.57 | 0 | 0 | 0 | 0 | 7 | ▇▁▁▁▁ |
| PMOTSCO | 0 | 1 | 0.17 | 0.89 | 0 | 0 | 0 | 0 | 7 | ▇▁▁▁▁ |
| PVRAAUT | 0 | 1 | 0.01 | 0.24 | 0 | 0 | 0 | 0 | 9 | ▇▁▁▁▁ |
| PAANHANG | 0 | 1 | 0.02 | 0.20 | 0 | 0 | 0 | 0 | 5 | ▇▁▁▁▁ |
| PTRACTOR | 0 | 1 | 0.09 | 0.60 | 0 | 0 | 0 | 0 | 7 | ▇▁▁▁▁ |
| PWERKT | 0 | 1 | 0.01 | 0.22 | 0 | 0 | 0 | 0 | 6 | ▇▁▁▁▁ |
| PBROM | 0 | 1 | 0.22 | 0.81 | 0 | 0 | 0 | 0 | 6 | ▇▁▁▁▁ |
| PLEVEN | 0 | 1 | 0.20 | 0.91 | 0 | 0 | 0 | 0 | 9 | ▇▁▁▁▁ |
| PPERSONG | 0 | 1 | 0.01 | 0.19 | 0 | 0 | 0 | 0 | 6 | ▇▁▁▁▁ |
| PGEZONG | 0 | 1 | 0.02 | 0.21 | 0 | 0 | 0 | 0 | 3 | ▇▁▁▁▁ |
| PWAOREG | 0 | 1 | 0.02 | 0.38 | 0 | 0 | 0 | 0 | 7 | ▇▁▁▁▁ |
| PBRAND | 0 | 1 | 1.85 | 1.88 | 0 | 0 | 2 | 4 | 8 | ▇▅▃▁▁ |
| PZEILPL | 0 | 1 | 0.00 | 0.06 | 0 | 0 | 0 | 0 | 3 | ▇▁▁▁▁ |
| PPLEZIER | 0 | 1 | 0.02 | 0.24 | 0 | 0 | 0 | 0 | 6 | ▇▁▁▁▁ |
| PFIETS | 0 | 1 | 0.03 | 0.16 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| PINBOED | 0 | 1 | 0.02 | 0.21 | 0 | 0 | 0 | 0 | 6 | ▇▁▁▁▁ |
| PBYSTAND | 0 | 1 | 0.05 | 0.40 | 0 | 0 | 0 | 0 | 5 | ▇▁▁▁▁ |
| AWAPART | 0 | 1 | 0.40 | 0.49 | 0 | 0 | 0 | 1 | 2 | ▇▁▅▁▁ |
| AWABEDR | 0 | 1 | 0.01 | 0.13 | 0 | 0 | 0 | 0 | 5 | ▇▁▁▁▁ |
| AWALAND | 0 | 1 | 0.02 | 0.14 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| APERSAUT | 0 | 1 | 0.56 | 0.61 | 0 | 0 | 1 | 1 | 12 | ▇▁▁▁▁ |
| ABESAUT | 0 | 1 | 0.01 | 0.13 | 0 | 0 | 0 | 0 | 5 | ▇▁▁▁▁ |
| AMOTSCO | 0 | 1 | 0.04 | 0.22 | 0 | 0 | 0 | 0 | 8 | ▇▁▁▁▁ |
| AVRAAUT | 0 | 1 | 0.00 | 0.07 | 0 | 0 | 0 | 0 | 4 | ▇▁▁▁▁ |
| AAANHANG | 0 | 1 | 0.01 | 0.12 | 0 | 0 | 0 | 0 | 3 | ▇▁▁▁▁ |
| ATRACTOR | 0 | 1 | 0.03 | 0.25 | 0 | 0 | 0 | 0 | 6 | ▇▁▁▁▁ |
| AWERKT | 0 | 1 | 0.01 | 0.11 | 0 | 0 | 0 | 0 | 6 | ▇▁▁▁▁ |
| ABROM | 0 | 1 | 0.07 | 0.27 | 0 | 0 | 0 | 0 | 3 | ▇▁▁▁▁ |
| ALEVEN | 0 | 1 | 0.08 | 0.38 | 0 | 0 | 0 | 0 | 8 | ▇▁▁▁▁ |
| APERSONG | 0 | 1 | 0.00 | 0.07 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| AGEZONG | 0 | 1 | 0.01 | 0.09 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| AWAOREG | 0 | 1 | 0.00 | 0.07 | 0 | 0 | 0 | 0 | 2 | ▇▁▁▁▁ |
| ABRAND | 0 | 1 | 0.57 | 0.56 | 0 | 0 | 1 | 1 | 7 | ▇▁▁▁▁ |
| AZEILPL | 0 | 1 | 0.00 | 0.03 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
| APLEZIER | 0 | 1 | 0.01 | 0.08 | 0 | 0 | 0 | 0 | 2 | ▇▁▁▁▁ |
| AFIETS | 0 | 1 | 0.03 | 0.21 | 0 | 0 | 0 | 0 | 4 | ▇▁▁▁▁ |
| AINBOED | 0 | 1 | 0.01 | 0.09 | 0 | 0 | 0 | 0 | 2 | ▇▁▁▁▁ |
| ABYSTAND | 0 | 1 | 0.01 | 0.12 | 0 | 0 | 0 | 0 | 2 | ▇▁▁▁▁ |
| CARAVAN | 0 | 1 | 0.06 | 0.24 | 0 | 0 | 0 | 0 | 1 | ▇▁▁▁▁ |
There are no missing values in our data or NANs [Not A Number]
Uninsured =length(which(df$CARAVAN == 0))
Insured = length(which(df$CARAVAN == 1))
We use this function to find out the total count of rows where customers were either insured = 1 or uninsured = 0 for Caravan Insurance Only
frame = data.frame(Policy_Status = factor(c("Uninsured","Insured"), levels=c("Uninsured","Insured")), Count = c(Uninsured,Insured))
plot_ly(frame, x = ~Policy_Status, y = ~Count, type = "bar", color = frame$Policy_Status, colors = c("Purple", "Gold")) %>%
layout(title = "<b>Insuraned vs Uninsured<b>", legend = list(title = list(text ='<b> Policy Status </b>')))
From the below graph we mapped those who were insured for Caravan insurance against those who were not. We see that the count of Uninsured = 9236 vs Insured = 586 This led us to think about the factors why such a large amount of customers are not Caravan insured vs those who were Hence, our research question, to figure out the characteristics of those who have Caravan Insurance
ggplot(df,aes(factor(df$MOSHOOFD))) +
geom_bar(aes(fill = factor(df$CARAVAN))) +
labs(x="Customer Main type") +
scale_fill_discrete(name = "CARAVAN") +
ggtitle("Caravan Policy based on Customer Main Type") +
theme(plot.title = element_text(hjust = 0.5))
df$maintype = df$MOSHOOFD
nrow(df[df$maintype == 1 & df$CARAVAN == 1,])
## [1] 75
nrow(df[df$maintype == 2 & df$CARAVAN == 1,])
## [1] 103
nrow(df[df$maintype == 3 & df$CARAVAN == 1,])
## [1] 109
nrow(df[df$maintype == 4 & df$CARAVAN == 1,])
## [1] 0
nrow(df[df$maintype == 5 & df$CARAVAN == 1,])
## [1] 18
nrow(df[df$maintype == 6 & df$CARAVAN == 1,])
## [1] 9
nrow(df[df$maintype == 7 & df$CARAVAN == 1,])
## [1] 35
nrow(df[df$maintype == 8 & df$CARAVAN == 1,])
## [1] 151
nrow(df[df$maintype == 9 & df$CARAVAN == 1,])
## [1] 75
nrow(df[df$maintype == 10 & df$CARAVAN == 1,])
## [1] 11
Here we wanted to see the which customer main type has the highest frequency/count of buying the insurance. Based on results, we see that there are atleast 4 main customer categories that buy insurance. However, for our ease and understanding purposes we will only consider the top 2. This brings us to select, category number 8 and 3, where 8 = Family with grown ups and 3 = Driven Growths
ggplot(df,aes(factor(df$MOSTYPE))) +
geom_bar(aes(fill = factor(df$CARAVAN))) +
labs(x="Customer Sub type") +
scale_fill_discrete(name = "CARAVAN") +
ggtitle("Policy Bought based on Customer sub Type") +
theme(plot.title = element_text(hjust = 0.5))
df$subtype = df$MOSTYPE
nrow(df[df$subtype == 33 & df$CARAVAN == 1,])
## [1] 80
nrow(df[df$subtype == 8 & df$CARAVAN == 1,])
## [1] 72
category 33 and 8 purchased more policies. Based on our main category which compromises of various sub-categories, we can see that sub-categories number 33 and 8 are porne to buying insurance. These should be considered as the characteristics/attributes of the types of customers that exist in the main customer category. hence, we could say that, those who are middle class and those who are low class but have large families have a higher chance of getting the insurance
ggplot(df,aes(factor(df$MGEMLEEF))) +
geom_bar(aes(fill = factor(df$CARAVAN))) +
geom_text(stat='count', aes(label=..count..), vjust=0) +
labs(x="Age Group") +
scale_fill_discrete(name = "CARAVAN") +
ggtitle("Policy bought on age group") +
theme(plot.title = element_text(hjust = 0.5))
## Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(count)` instead.
df$age = df$MGEMLEEF
nrow(df[df$age == 1 & df$CARAVAN == 1,])
## [1] 1
nrow(df[df$age == 2 & df$CARAVAN == 1,])
## [1] 156
nrow(df[df$age == 3 & df$CARAVAN == 1,])
## [1] 303
nrow(df[df$age == 4 & df$CARAVAN == 1,])
## [1] 105
nrow(df[df$age == 5 & df$CARAVAN == 1,])
## [1] 20
nrow(df[df$age == 6 & df$CARAVAN == 1,])
## [1] 1
Here we have explored to see what is the age range of the customers that buy the insurance. Based on our analysis we see that customers who are between the ages 40-50 are prone to buying insurance compared with others. Hence, we could say that it is among the many characteristics of the main customer group [3,8]
ggplot(df,aes(factor(df$MAANTHUI))) +
geom_bar(aes(fill = factor(df$CARAVAN))) +
geom_text(stat='count', aes(label=..count..), vjust=0) +
labs(x="Number of houses") +
scale_fill_discrete(name = "CARAVAN") +
ggtitle("Number of houses customer has who bought insurance") +
theme(plot.title = element_text(hjust = 0.5))
df$noofhouses = df$MAANTHUI
nrow(df[df$noofhouses == 1 & df$CARAVAN == 1,])
## [1] 526
nrow(df[df$noofhouses == 2 & df$CARAVAN == 1,])
## [1] 59
nrow(df[df$noofhouses == 3 & df$CARAVAN == 1,])
## [1] 1
Now we wanted to see who is prone to getting an insurance with respect to number of houses and we have found that customers having at least 1 house are likely to get the insurance.
ggplot(df,aes(factor(df$MGEMOMV))) +
geom_bar(aes(fill = factor(df$CARAVAN))) +
geom_text(stat='count', aes(label=..count..), vjust=0) +
labs(x="Number of house hold") +
scale_fill_discrete(name = "CARAVAN") +
ggtitle("Number of house hold") +
theme(plot.title = element_text(hjust = 0.5))
df$noofhousehold = df$MGEMOMV
nrow(df[df$noofhousehold == 1 & df$CARAVAN == 1,])
## [1] 11
nrow(df[df$noofhousehold == 2 & df$CARAVAN == 1,])
## [1] 195
nrow(df[df$noofhousehold == 3 & df$CARAVAN == 1,])
## [1] 275
df$hasThreeHouseHold = ifelse(df$noofhousehold == 3, 1, 0)
Now we wanted to see who is prone to getting an insurance with respect to number of houses and we have found that customers having at least 1 house are likely to get the insurance.
Characteristics we found so far
1. Customer having 3 house hold
2. Customer have one house, Age of customer is between 40 to 50
3. Customer are Driven Growers
4. Customer belongs to Lower class large families
corrplot(cor(df[, c("subtype","maintype", "age", "noofhouses", "noofhousehold", "CARAVAN")]), method = "number")
From the correlation matrix we have below, we have some interesting
insights. The reason to run the correlation matrix was to figure out
variables of interest which might cause either over-fitting or
under-fitting. # Here we can see that the variables of interest are
positively correlated. One exception to this is the correlation between
age and number of households. We see that there is a negative
correlation between it. Which actually makes sense because, the greater
the age, the no of households will decrease.
table(df$MAANTHUI)
##
## 1 2 3 4 5 6 7 8 10
## 8915 821 64 4 3 3 8 2 2
df$MAANTHUI = replace(df$MAANTHUI, df$MAANTHUI > 2, 2)
df$OneHouse = ifelse(df$MAANTHUI ==1, 1, 0)
df$moreThanTwoHouse = ifelse(df$MAANTHUI > 2, 1, 0)
by looking frequencies of houses we categorized houses into dummies. As we already saw from the above bar graph that customers having at least 1 house are likely to buy insurance, we thought it would be a good idea to create dummies in a binary way instead of having many different ranges. That is why we have done this step.
df$averageFamily = ifelse(df$MOSTYPE %in% c(12,11,9,10,13), 1, 0)
df$loners = ifelse(df$MOSTYPE %in% c(17,15,18,16,19), 1, 0)
df$conservativeFamilies = ifelse(df$MOSTYPE %in% c(39,38), 1, 0)
df$crusingSeniors = ifelse(df$MOSTYPE %in% c(26,25,28,27), 1, 0)
df$drivenGrowers = ifelse(df$MOSTYPE %in% c(6,7,8), 1, 0)
df$grownups = ifelse(df$MOSTYPE %in% c(33,34,35,36,37), 1, 0)
df$framers = ifelse(df$MOSTYPE %in% c(40,41), 1, 0)
df$livingWell = ifelse(df$MOSTYPE %in% c(20,21,22,23,24), 1, 0)
df$retired = ifelse(df$MOSTYPE %in% c(29,30,31,32), 1, 0)
df$successful = ifelse(df$MOSTYPE %in% c(1,2,3,4,5), 1, 0)
dat <- data.frame(
Categorized_Customers = factor(c("averageFamily", "loners", "conservativeFamilies", "crusingSeniors", "drivenGrowers", "grownups", "framers", "livingWell", "retired", "successful"), levels=c("averageFamily", "loners", "conservativeFamilies", "crusingSeniors", "drivenGrowers", "grownups", "framers", "livingWell", "retired", "successful")),
Count = c( sum(df$averageFamily), sum(df$loners), sum(df$conservativeFamilies), sum(df$crusingSeniors), sum(df$drivenGrowers), sum(df$grownups), sum(df$framers), sum(df$livingWell), sum(df$retired), sum(df$successful) )
)
plot_ly(dat, x = ~Categorized_Customers, y = ~Count, type = 'bar', color = dat$Categorized_Customers) %>%
layout(title = "<b>Customer Types<b>", legend = list(title = list(text ='<b> Types </b>')))
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
## Warning in RColorBrewer::brewer.pal(N, "Set2"): n too large, allowed maximum for palette Set2 is 8
## Returning the palette you asked for with that many colors
We have 10 customer main types and 41 customer sub-types. A leveled approach was to consider merging them into one bucket based on sub-category. Since one of our goal is to find the characteristics of customers, we believe that the customer sub-type are the characteristics. However, when we look at customer main type alone, we cannot figure out the characteristics of people in this group. Hence, we merged sub-categories with main categories.
An advantage of doing this is that when we look at customer main type, we would automatically know what are the characteristics of this group - explained by the sub types within this group
After having done so, we constructed a bar chart and observed the following:
# Converting 30k income into value
df$MINKM30_c = ifelse(df$MINKM30 == 1, 0.05 * 30000, df$MINKM30)
df$MINKM30_c = ifelse(df$MINKM30_c == 2, 0.17 * 30000, df$MINKM30_c)
df$MINKM30_c = ifelse(df$MINKM30_c == 3, 0.3 * 30000, df$MINKM30_c)
df$MINKM30_c = ifelse(df$MINKM30_c == 4, 0.43 * 30000, df$MINKM30_c)
df$MINKM30_c = ifelse(df$MINKM30_c == 5, 0.56 * 30000, df$MINKM30_c)
df$MINKM30_c = ifelse(df$MINKM30_c == 6, 0.69 * 30000, df$MINKM30_c)
df$MINKM30_c = ifelse(df$MINKM30_c == 7, 0.82 * 30000, df$MINKM30_c)
df$MINKM30_c = ifelse(df$MINKM30_c == 8, 0.94 * 30000, df$MINKM30_c)
df$MINKM30_c = ifelse(df$MINKM30_c == 9, 1 * 30000, df$MINKM30_c)
# Converting 45k income into value
df$MINK3045_c = ifelse(df$MINK3045 == 1, 0.05 * 45000, df$MINK3045)
df$MINK3045_c = ifelse(df$MINK3045_c == 2, 0.17 * 45000, df$MINK3045_c)
df$MINK3045_c = ifelse(df$MINK3045_c == 3, 0.3 * 45000, df$MINK3045_c)
df$MINK3045_c = ifelse(df$MINK3045_c == 4, 0.43 * 45000, df$MINK3045_c)
df$MINK3045_c = ifelse(df$MINK3045_c == 5, 0.56 * 45000, df$MINK3045_c)
df$MINK3045_c = ifelse(df$MINK3045_c == 6, 0.69 * 45000, df$MINK3045_c)
df$MINK3045_c = ifelse(df$MINK3045_c == 7, 0.82 * 45000, df$MINK3045_c)
df$MINK3045_c = ifelse(df$MINK3045_c == 8, 0.94 * 45000, df$MINK3045_c)
df$MINK3045_c = ifelse(df$MINK3045_c == 9, 1 * 45000, df$MINK3045_c)
# Converting 70k income into value
df$MINK4575_c = ifelse(df$MINK4575 == 1, 0.05 * 75000, df$MINK4575)
df$MINK4575_c = ifelse(df$MINK4575_c == 2, 0.17 * 75000, df$MINK4575_c)
df$MINK4575_c = ifelse(df$MINK4575_c == 3, 0.3 * 75000, df$MINK4575_c)
df$MINK4575_c = ifelse(df$MINK4575_c == 4, 0.43 * 75000, df$MINK4575_c)
df$MINK4575_c = ifelse(df$MINK4575_c == 5, 0.56 * 75000, df$MINK4575_c)
df$MINK4575_c = ifelse(df$MINK4575_c == 6, 0.69 * 75000, df$MINK4575_c)
df$MINK4575_c = ifelse(df$MINK4575_c == 7, 0.82 * 75000, df$MINK4575_c)
df$MINK4575_c = ifelse(df$MINK4575_c == 8, 0.94 * 75000, df$MINK4575_c)
df$MINK4575_c = ifelse(df$MINK4575_c == 9, 1 * 75000, df$MINK4575_c)
# Converting 122k income into value
df$MINK7512_c = ifelse(df$MINK7512 == 1, 0.05 * 122000, df$MINK7512)
df$MINK7512_c = ifelse(df$MINK7512_c == 2, 0.17 * 122000, df$MINK7512_c)
df$MINK7512_c = ifelse(df$MINK7512_c == 3, 0.3 * 122000, df$MINK7512_c)
df$MINK7512_c = ifelse(df$MINK7512_c == 4, 0.43 * 122000, df$MINK7512_c)
df$MINK7512_c = ifelse(df$MINK7512_c == 5, 0.56 * 122000, df$MINK7512_c)
df$MINK7512_c = ifelse(df$MINK7512_c == 6, 0.69 * 122000, df$MINK7512_c)
df$MINK7512_c = ifelse(df$MINK7512_c == 7, 0.82 * 122000, df$MINK7512_c)
df$MINK7512_c = ifelse(df$MINK7512_c == 8, 0.94 * 122000, df$MINK7512_c)
df$MINK7512_c = ifelse(df$MINK7512_c == 9, 1 * 122000, df$MINK7512_c)
# Converting 123k income into value
df$MINK123M_c = ifelse(df$MINK123M == 1, 0.05 * 123000, df$MINK123M)
df$MINK123M_c = ifelse(df$MINK123M_c == 2, 0.17 * 123000, df$MINK123M_c)
df$MINK123M_c = ifelse(df$MINK123M_c == 3, 0.3 * 123000, df$MINK123M_c)
df$MINK123M_c = ifelse(df$MINK123M_c == 4, 0.43 * 123000, df$MINK123M_c)
df$MINK123M_c = ifelse(df$MINK123M_c == 5, 0.56 * 123000, df$MINK123M_c)
df$MINK123M_c = ifelse(df$MINK123M_c == 6, 0.69 * 123000, df$MINK123M_c)
df$MINK123M_c = ifelse(df$MINK123M_c == 7, 0.82 * 123000, df$MINK123M_c)
df$MINK123M_c = ifelse(df$MINK123M_c == 8, 0.94 * 123000, df$MINK123M_c)
df$MINK123M_c = ifelse(df$MINK123M_c == 9, 1 * 123000, df$MINK123M_c)
# Average income
df$MINKGEM_c = (df$MINK123M_c + df$MINK7512_c + df$MINK4575_c + df$MINK3045_c + df$MINKM30_c)/5
plot_ly(x = ~df$MINKGEM_c, type = "histogram", color = df$MINKGEM_c, colors = c("gold", "blue", "green", "pink", "brown")) %>%
layout(title = "<b>Income Levels<b>")
## Warning: textfont.color doesn't (yet) support data arrays
## Warning in min(x, na.rm = na.rm): no non-missing arguments to min; returning Inf
## Warning in max(x, na.rm = na.rm): no non-missing arguments to max; returning
## -Inf
## Warning: textfont.color doesn't (yet) support data arrays
Since our data is categorical and gives us levels, after having discussed with the professor, we agreed that a good approach would be convert the income as a categorical variable to numeric one.
In order to do that, we had to decide a method by which we would do so. The method was decided, mapped out in our excel file
Here we just brought the calculations done in excel
The idea was to take the average of income for each category
From the following histogram we can see that most of customers are skewed towards left but we can see in detail as well that most customers are between 5k - 20K
df$MGEMLEEF_c = ifelse(df$MGEMLEEF == 1, 25, df$MGEMLEEF)
df$MGEMLEEF_c = ifelse(df$MGEMLEEF_c == 2, 35, df$MGEMLEEF_c)
df$MGEMLEEF_c = ifelse(df$MGEMLEEF_c == 3, 45, df$MGEMLEEF_c)
df$MGEMLEEF_c = ifelse(df$MGEMLEEF_c == 4, 55, df$MGEMLEEF_c)
df$MGEMLEEF_c = ifelse(df$MGEMLEEF_c == 5, 65, df$MGEMLEEF_c)
df$MGEMLEEF_c = ifelse(df$MGEMLEEF_c == 4, 75, df$MGEMLEEF_c)
Just as we did for income, we did the same thing for age well
The idea and logic behind it was the same as it was behind converting income to numeric
We believe that age should be numeric because it is best explained when numeric
# MOSTYPE : customer subtype
# MFWEKIND: Household with children
# MOPLLAAG: Lower level education
# MHHUUR : Rented house
# MHKOOP : Home owners
# MINKM30: Income < 30.000 low income
# MINK7512: Income 75-122.000 high income
# MKOOPKLA: Purchasing power class
# PPERSAUT: Contribution car policies
# CARAVAN: Number of mobile home policies 0 - 1
plot_ly(x = ~df$MINKM30_c, y = ~df$MOSTYPE, type = "box", color = df$MOSTYPE)
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
After having converted the categorical variable for income to numerical, here we take income less than 3000 against customer sub-categories and draw a box plot. From the below box plot, we can see that as soon as income crosses 10K, we begint to see a few outliers for certain sub-categories. Another important thing to note is that when income jumps above 20K, the distance of outliers starts to increase. We could say that we are seeing extreme outliers in income levels ranging from 25K to 30K.
Now the question is if we should keep outliers in our analysis, whether mild or extreme or delete both of them and then proceed?
plot_ly(y = ~df$MOSTYPE, x = ~df$MINK3045_c, type = "box", color = df$MOSTYPE)
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
plot_ly(x = df$MINK4575_c, y = df$MOSTYPE, type = "box", color = df$MOSTYPE)
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
## Warning: line.color doesn't (yet) support data arrays
## Warning: Only one fillcolor per trace allowed
prop.table(table(df$CARAVAN))
##
## 0 1
## 0.94033802 0.05966198
When drew a frequency table, we saw that we have a class imbalance problem as 94% of the records in the CARAVAN column are 0s while only 5% of them are 1s. This is a problem which needs to be solved first before we begin further
barplot(prop.table(table(df$CARAVAN)), col = rainbow(2), ylim = c(0,1), main = "Class Distribution")
Here we have shown how the distribution is happening through a bar
chart
# training data
df_train = (df[df$ORIGIN == "train",])
df_train = (df_train[,-1])
table(df_train$CARAVAN)
##
## 0 1
## 5474 348
#testing data
df_test = (df[df$ORIGIN == "test",])
df_test = (df_test[,-1])
nrow(df_test)
## [1] 4000
table(df_test$CARAVAN)
##
## 0 1
## 3762 238
over_train = ovun.sample(CARAVAN ~ ., data =df_train, method = "over", N =10948)$data
table(over_train$CARAVAN)
##
## 0 1
## 5474 5474
over_test = ovun.sample(CARAVAN ~ ., data =df_test, method = "over", N =nrow(df_test))$data
table(over_test$CARAVAN)
##
## 0 1
## 3762 238
we are not fixing sampling problem in test data, we did this because we were having error while doing prediction
for(i in 1:ncol(over_train)){
over_train[,i] <- as.factor(over_train[,i])
}
In the followng code, since our entire data is categorical we have converted all of it into factors, for training data
for(i in 1:ncol(over_test)){
over_test[,i] <- as.factor(over_test[,i])
}
In the followng code, since our entire data is categorical we have converted all of it into factors, for test data as well
over_test$MINKGEM_c = as.numeric(over_test$MINKGEM_c)
over_train$MINKGEM_c = as.numeric(over_train$MINKGEM_c)
over_test$MGEMLEEF_c = as.numeric(over_test$MGEMLEEF_c)
over_train$MGEMLEEF_c = as.numeric(over_train$MGEMLEEF_c)
We realized that income should not be as a factor since we converted so we just reconverted back to numeric
drewSummary = function(model) {
summary(model)
}
drewMatrix = function(model, test_data) {
predicted = predict(model, test_data, type = "response")
predictedClass = ifelse(predicted>=0.5, 1, 0)
confusionMatrix(as.factor(predictedClass), as.factor(test_data$CARAVAN), positive = "1")
}
drewAnova = function(model1, model2){
anova(model1, model2, test = 'Chisq')
}
drewROC = function(model){
predicted = predict(model, over_test, type = "response")
predictedClass = ifelse(predicted>=0.5, 1, 0)
r = roc(over_test$CARAVAN, predictedClass)
plot.roc(r)
}
getRMSE = function(predictedClass){
accuracy(predictedClass, as.numeric(over_test$CARAVAN))[2]
}
new_data = over_train
new_data$MOSHOOFD = as.numeric(new_data$MOSHOOFD)
new_data$MGEMOMV = as.numeric(new_data$MGEMOMV)
new_data$MINKGEM = as.numeric(new_data$MINKGEM)
new_data$MGEMLEEF = as.numeric(new_data$MGEMLEEF)
new_data$CARAVAN = as.numeric(new_data$CARAVAN)
new_data$OneHouse = as.numeric(new_data$OneHouse)
corrplot(cor(subset(new_data , select = c("MOSHOOFD", "MGEMOMV", "OneHouse", "MINKGEM", "MGEMLEEF", "CARAVAN"))), method = "number", type = "upper")
We have performed a correlation matrix to determine the factors for
model one. The variables are those which we have already used above. The
correlation matrix tells us which variables are important to least
important
set.seed(123)
logit.reg = glm(CARAVAN ~ MOSHOOFD + MGEMOMV + OneHouse + MINKGEM_c+MGEMLEEF_c, data = over_train, family = binomial (link = "logit"))
logit.reg$xlevels[["MGEMOMV"]] <- union(logit.reg$xlevels[["MGEMOMV"]], levels(over_test$MGEMOMV))
drewSummary(logit.reg)
##
## Call:
## glm(formula = CARAVAN ~ MOSHOOFD + MGEMOMV + OneHouse + MINKGEM_c +
## MGEMLEEF_c, family = binomial(link = "logit"), data = over_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7524 -1.1471 0.3593 1.1508 1.8673
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.623655 0.204826 -3.045 0.00233 **
## MOSHOOFD2 0.390835 0.085280 4.583 4.58e-06 ***
## MOSHOOFD3 -0.310404 0.081514 -3.808 0.00014 ***
## MOSHOOFD4 -14.597185 120.532051 -0.121 0.90361
## MOSHOOFD5 -0.941544 0.104585 -9.003 < 2e-16 ***
## MOSHOOFD6 -1.246472 0.157307 -7.924 2.30e-15 ***
## MOSHOOFD7 -0.729989 0.095261 -7.663 1.82e-14 ***
## MOSHOOFD8 -0.340075 0.071590 -4.750 2.03e-06 ***
## MOSHOOFD9 -0.142648 0.085464 -1.669 0.09510 .
## MOSHOOFD10 -1.619452 0.144267 -11.225 < 2e-16 ***
## MGEMOMV2 0.353052 0.129042 2.736 0.00622 **
## MGEMOMV3 0.303507 0.131857 2.302 0.02135 *
## MGEMOMV4 0.346560 0.142779 2.427 0.01521 *
## MGEMOMV5 0.183444 0.225311 0.814 0.41554
## OneHouse1 0.123074 0.070856 1.737 0.08239 .
## MINKGEM_c 0.002208 0.000263 8.394 < 2e-16 ***
## MGEMLEEF_c 0.057575 0.030824 1.868 0.06178 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 15177 on 10947 degrees of freedom
## Residual deviance: 14363 on 10931 degrees of freedom
## AIC: 14397
##
## Number of Fisher Scoring iterations: 13
drewMatrix(logit.reg, over_test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 315 10
## 1 3447 228
##
## Accuracy : 0.1358
## 95% CI : (0.1253, 0.1468)
## No Information Rate : 0.9405
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0054
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.95798
## Specificity : 0.08373
## Pos Pred Value : 0.06204
## Neg Pred Value : 0.96923
## Prevalence : 0.05950
## Detection Rate : 0.05700
## Detection Prevalence : 0.91875
## Balanced Accuracy : 0.52086
##
## 'Positive' Class : 1
##
drewROC(logit.reg)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
# getRMSE(logit.reg)
# drewAnova(logit.reg)
-we did regression with/out house we see a minimal affect i.e with house model is predicting 60 true positives without it’s predicting 65 and it’s not significant, so we decided not to include this variable
-difference in deviance = Null deviance (15177) - 14398 = 872
-From the below confusion matrix for model 1 we see that the accuracy of the model is 11% but the sensitivity is 98% while specificity is 57%. There could be couple of cases to either consider or discard the model. If specificity is the goal then this model is a good fit otherwise, it is not.
train_2 = over_train
train_2 = subset(over_train, select = -c(maintype,subtype,age,noofhouses,noofhousehold,hasThreeHouseHold,OneHouse,moreThanTwoHouse,averageFamily,loners,conservativeFamilies,crusingSeniors,drivenGrowers,grownups,framers,livingWell,retired,successful,MINKM30_c,MINK3045_c,MINK4575_c,MINK7512_c,MINK123M_c,MINKGEM_c,MGEMLEEF_c,MOSHOOFD,MGEMOMV,MAANTHUI,MINKGEM,MGEMLEEF) )
length(train_2)
## [1] 81
Dropping 5 variables that we have used in first regression. and dummies we have created earlier.
new_df = train_2
for(i in 1:ncol(new_df)){
new_df[,i] <- as.integer(new_df[,i])
}
Now we build model number 2 which will also use correlation matrix but this time we want to remove variables that are highly correlated and get those variables which are less correlated hence have a good impact on the response variable and its predictors. We are removing such variables step by step and explaining why are we doing that.
Pre processing
zv = apply(new_df, 2, function(x) length(unique(x)) == 1)
dfr = new_df[, !zv]
n=length(colnames(dfr))
correlationMatrix = cor(dfr[,1:n],use="complete.obs")
summary(correlationMatrix[upper.tri(correlationMatrix)])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.999760 -0.023458 -0.001316 0.010568 0.036060 0.984158
After removing our suspected predictors we still have strong positive correlation with 1% and strong negative corelation with 99.9%, we need to find which of them are highly correlated
high = findCorrelation(correlationMatrix, cutoff = 0.75, names = TRUE)
high
## [1] "MZPART" "MHHUUR" "MSKA" "MRELGE" "PBRAND" "PWAPART"
## [7] "APERSAUT" "PTRACTOR" "MGODGE" "AWABEDR" "AWALAND" "AAANHANG"
## [13] "PBESAUT" "PZEILPL" "APLEZIER" "PBYSTAND" "AWERKT" "PLEVEN"
## [19] "PGEZONG" "AWAOREG" "PBROM" "PFIETS" "PINBOED" "PMOTSCO"
## [25] "APERSONG" "AVRAAUT"
length(high)
## [1] 26
there are 26 variables which are correlated with each other, before dropping them we need to see how they are correlated with response variable. The reason we why we have chose cut-off value of .75 is because it gives us variables that are strongly correlated with each other
target_cor_df = data.frame(CARAVAN = cor(df_train[,sort(high)], df_train[, "CARAVAN"]))
cor_df = target_cor_df[order(target_cor_df$CARAVAN,decreasing = T),,drop=F]
excludedVariables = row.names(cor_df[cor_df$CARAVAN < 0.1, ,drop=F])
excludedVariables
## [1] "PWAPART" "PBRAND" "MRELGE" "MSKA" "PBYSTAND" "MZPART"
## [7] "PGEZONG" "PFIETS" "AWAOREG" "PLEVEN" "PZEILPL" "AAANHANG"
## [13] "PMOTSCO" "PINBOED" "AWABEDR" "PBESAUT" "APERSONG" "AVRAAUT"
## [19] "PTRACTOR" "AWERKT" "AWALAND" "MGODGE" "PBROM" "MHHUUR"
paste0("excluding total variables from main data set ", length(excludedVariables))
## [1] "excluding total variables from main data set 24"
There are 24 variables which are less correlated with response variable and having correlation coefficient less than 0.1 so we will exclude them. These 24 variables are from 26 variables from the above
train_2 = data.frame(train_2[, !colnames(train_2) %in% excludedVariables])
names(train_2)
## [1] "MOSTYPE" "MGODRK" "MGODPR" "MGODOV" "MRELSA" "MRELOV"
## [7] "MFALLEEN" "MFGEKIND" "MFWEKIND" "MOPLHOOG" "MOPLMIDD" "MOPLLAAG"
## [13] "MBERHOOG" "MBERZELF" "MBERBOER" "MBERMIDD" "MBERARBG" "MBERARBO"
## [19] "MSKB1" "MSKB2" "MSKC" "MSKD" "MHKOOP" "MAUT1"
## [25] "MAUT2" "MAUT0" "MZFONDS" "MINKM30" "MINK3045" "MINK4575"
## [31] "MINK7512" "MINK123M" "MKOOPKLA" "PWABEDR" "PWALAND" "PPERSAUT"
## [37] "PVRAAUT" "PAANHANG" "PWERKT" "PPERSONG" "PWAOREG" "PPLEZIER"
## [43] "AWAPART" "APERSAUT" "ABESAUT" "AMOTSCO" "ATRACTOR" "ABROM"
## [49] "ALEVEN" "AGEZONG" "ABRAND" "AZEILPL" "APLEZIER" "AFIETS"
## [55] "AINBOED" "ABYSTAND" "CARAVAN"
length(train_2)
## [1] 57
# corrplot(cor(train_3), method = "number")
# 24 + 5 (predictors in mode l) = 29
# around 29 variables have been excluded from set so far next step would be to find good predictors which are not highly correlated each other and are significant.
# we have reduce dimension from 86 to 57
24 + 5 (predictors in mode l) = 29
around 29 variables have been excluded from set so far next step would be to find good predictors which are not highly correlated each other and are significant.
we have reduce dimension from 86 to 57
# corelation between no of car policy and carvan
cor(df_train$APERSAUT, df_train$CARAVAN)
## [1] 0.1442105
# # corelation between contribution car policies and carvan
cor(df_train$PPERSAUT, df_train$CARAVAN)
## [1] 0.1509097
# # corelation between Purchasing power class and carvan
cor(df_train$MKOOPKLA, df_train$CARAVAN)
## [1] 0.09593826
new_train_2 = train_2
for(i in 1:ncol(new_train_2)){
new_train_2[,i] <- as.numeric(new_train_2[,i])
}
cor_response = data.frame("ind_var" = colnames(new_train_2), "dep_var" = "CARAVAN", "cor_coeff" = 0, "p_values" = 0)
for (i in colnames(new_train_2)){
cor_test <- cor.test(new_train_2[,i], new_train_2[,"CARAVAN"])
cor_response[cor_response$ind_var == i, "correlation_coefficient"] = cor_test$estimate
cor_response[cor_response$ind_var == i, "p_values"] = cor_test$p.value
}
cor_response[order(cor_response$cor_coeff, decreasing = T),]
## ind_var dep_var cor_coeff p_values correlation_coefficient
## 1 MOSTYPE CARAVAN 0 2.265491e-40 -0.126629109
## 2 MGODRK CARAVAN 0 1.415021e-01 0.014052163
## 3 MGODPR CARAVAN 0 1.257828e-17 0.081563718
## 4 MGODOV CARAVAN 0 2.772500e-01 0.010385020
## 5 MRELSA CARAVAN 0 2.060545e-15 -0.075761616
## 6 MRELOV CARAVAN 0 1.180399e-50 -0.142344374
## 7 MFALLEEN CARAVAN 0 5.309752e-40 -0.126026326
## 8 MFGEKIND CARAVAN 0 2.514008e-02 0.021400986
## 9 MFWEKIND CARAVAN 0 4.201894e-13 0.069204271
## 10 MOPLHOOG CARAVAN 0 5.453918e-69 0.166621631
## 11 MOPLMIDD CARAVAN 0 2.208517e-22 0.092818069
## 12 MOPLLAAG CARAVAN 0 6.059450e-91 -0.191512831
## 13 MBERHOOG CARAVAN 0 7.664487e-45 0.133694736
## 14 MBERZELF CARAVAN 0 3.100937e-12 0.066576997
## 15 MBERBOER CARAVAN 0 1.296953e-39 -0.125391135
## 16 MBERMIDD CARAVAN 0 3.868878e-17 0.080320641
## 17 MBERARBG CARAVAN 0 8.321327e-19 -0.084493667
## 18 MBERARBO CARAVAN 0 6.533712e-35 -0.117411705
## 19 MSKB1 CARAVAN 0 8.844832e-09 0.054933370
## 20 MSKB2 CARAVAN 0 1.736747e-01 0.013003473
## 21 MSKC CARAVAN 0 2.606574e-20 -0.088090245
## 22 MSKD CARAVAN 0 3.109772e-58 -0.152867441
## 23 MHKOOP CARAVAN 0 3.921174e-80 0.179713174
## 24 MAUT1 CARAVAN 0 8.114368e-65 0.161431525
## 25 MAUT2 CARAVAN 0 3.139517e-02 0.020567625
## 26 MAUT0 CARAVAN 0 3.099707e-81 -0.180953606
## 27 MZFONDS CARAVAN 0 4.201169e-38 -0.122885514
## 28 MINKM30 CARAVAN 0 2.684951e-95 -0.196050678
## 29 MINK3045 CARAVAN 0 6.678538e-01 -0.004101420
## 30 MINK4575 CARAVAN 0 1.444118e-43 0.131719061
## 31 MINK7512 CARAVAN 0 5.855925e-42 0.129183471
## 32 MINK123M CARAVAN 0 5.744262e-01 -0.005367394
## 33 MKOOPKLA CARAVAN 0 2.288708e-91 0.191958627
## 34 PWABEDR CARAVAN 0 1.000000e+00 0.000000000
## 35 PWALAND CARAVAN 0 2.092667e-07 -0.049585947
## 36 PPERSAUT CARAVAN 0 6.282637e-306 0.346248808
## 37 PVRAAUT CARAVAN 0 3.485079e-03 -0.027917884
## 38 PAANHANG CARAVAN 0 1.312035e-02 0.023705979
## 39 PWERKT CARAVAN 0 1.875280e-05 -0.040885355
## 40 PPERSONG CARAVAN 0 6.483585e-03 -0.026015928
## 41 PWAOREG CARAVAN 0 4.617519e-06 0.043770091
## 42 PPLEZIER CARAVAN 0 2.923919e-33 0.114476321
## 43 AWAPART CARAVAN 0 4.983805e-82 0.181841434
## 44 APERSAUT CARAVAN 0 5.451683e-238 0.307227802
## 45 ABESAUT CARAVAN 0 6.981609e-02 -0.017328933
## 46 AMOTSCO CARAVAN 0 3.649931e-02 0.019987546
## 47 ATRACTOR CARAVAN 0 8.764842e-06 -0.042473319
## 48 ABROM CARAVAN 0 1.431323e-33 -0.115033748
## 49 ALEVEN CARAVAN 0 1.359462e-12 0.067672943
## 50 AGEZONG CARAVAN 0 5.895091e-08 0.051790072
## 51 ABRAND CARAVAN 0 7.264731e-45 0.133730502
## 52 AZEILPL CARAVAN 0 1.600743e-03 0.030157056
## 53 APLEZIER CARAVAN 0 1.617283e-40 0.126866825
## 54 AFIETS CARAVAN 0 1.934488e-09 0.057333528
## 55 AINBOED CARAVAN 0 7.836628e-03 0.025411780
## 56 ABYSTAND CARAVAN 0 8.302969e-26 0.100145723
## 57 CARAVAN CARAVAN 0 0.000000e+00 1.000000000
Here we are seeing the remaining variable’s and seeing their significance level with our response variables and all of the variables are significant at .05.
Now we will further plot a correlation matrix to make sure there is no co-linearity among the variables with respect to our response variable
corrplot(cor(subset(new_train_2 , select = c(-CARAVAN))), method = "square", type = "upper")
cor(df_train$PPERSAUT, df_train$APERSAUT)
## [1] 0.9161545
cor(df_train[ , c("PPERSAUT", "APERSAUT")], df_train[ , "CARAVAN"])
## CARAVAN
## PPERSAUT 0.1509097
## APERSAUT 0.1442105
train_2 = data.frame(train_2[ , !colnames(train_2) %in% c("APERSAUT")])
paste0("after removing APERSAUT dimension is", length(train_2))
## [1] "after removing APERSAUT dimension is56"
There is a high correlation between car policies and number of car policies which we will exclude as variables which have less correlation with response variable. It’s not always necessary to see how much it relates which response variable but it’s good as if it tells us how much response variable changes for given predictor. Contribution car policies is more correlated with response variable so we exclude Number of car policies
# final variable selected
names(train_2)
## [1] "MOSTYPE" "MGODRK" "MGODPR" "MGODOV" "MRELSA" "MRELOV"
## [7] "MFALLEEN" "MFGEKIND" "MFWEKIND" "MOPLHOOG" "MOPLMIDD" "MOPLLAAG"
## [13] "MBERHOOG" "MBERZELF" "MBERBOER" "MBERMIDD" "MBERARBG" "MBERARBO"
## [19] "MSKB1" "MSKB2" "MSKC" "MSKD" "MHKOOP" "MAUT1"
## [25] "MAUT2" "MAUT0" "MZFONDS" "MINKM30" "MINK3045" "MINK4575"
## [31] "MINK7512" "MINK123M" "MKOOPKLA" "PWABEDR" "PWALAND" "PPERSAUT"
## [37] "PVRAAUT" "PAANHANG" "PWERKT" "PPERSONG" "PWAOREG" "PPLEZIER"
## [43] "AWAPART" "ABESAUT" "AMOTSCO" "ATRACTOR" "ABROM" "ALEVEN"
## [49] "AGEZONG" "ABRAND" "AZEILPL" "APLEZIER" "AFIETS" "AINBOED"
## [55] "ABYSTAND" "CARAVAN"
length(train_2)
## [1] 56
After having removed all the variables which are highly correlated with each other we are left with 56 variables of importance
step.wise1 = glm(CARAVAN ~ ., data = train_2, family = binomial(link = "logit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(step.wise1)
##
## Call:
## glm(formula = CARAVAN ~ ., family = binomial(link = "logit"),
## data = train_2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.7081 -0.5318 0.0001 0.6927 2.0713
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.450e+00 4.510e+03 -0.001 0.999213
## MOSTYPE2 2.357e-01 8.749e-01 0.269 0.787613
## MOSTYPE3 8.912e-01 8.058e-01 1.106 0.268786
## MOSTYPE4 -9.496e-01 8.834e-01 -1.075 0.282404
## MOSTYPE5 5.134e+00 1.963e+00 2.616 0.008901 **
## MOSTYPE6 6.354e-01 3.261e-01 1.948 0.051363 .
## MOSTYPE7 -3.775e-01 8.469e-01 -0.446 0.655804
## MOSTYPE8 1.467e+00 5.691e-01 2.577 0.009970 **
## MOSTYPE9 6.266e+00 1.889e+00 3.317 0.000909 ***
## MOSTYPE10 3.363e-02 3.194e-01 0.105 0.916135
## MOSTYPE11 -2.808e-01 7.973e-01 -0.352 0.724703
## MOSTYPE12 2.043e+00 5.871e-01 3.480 0.000502 ***
## MOSTYPE13 4.911e-01 7.989e-01 0.615 0.538759
## MOSTYPE15 -2.487e+01 3.096e+03 -0.008 0.993591
## MOSTYPE16 -2.241e+01 1.718e+03 -0.013 0.989590
## MOSTYPE17 -1.674e+01 1.825e+03 -0.009 0.992684
## MOSTYPE18 -2.465e+01 1.596e+03 -0.015 0.987679
## MOSTYPE19 -1.342e+01 2.911e+03 -0.005 0.996322
## MOSTYPE20 -1.174e+01 1.353e+03 -0.009 0.993077
## MOSTYPE21 -2.475e+01 1.728e+03 -0.014 0.988573
## MOSTYPE22 -1.161e+01 1.353e+03 -0.009 0.993153
## MOSTYPE23 5.188e+00 1.939e+00 2.676 0.007448 **
## MOSTYPE24 -1.160e+01 1.353e+03 -0.009 0.993162
## MOSTYPE25 -1.120e+01 1.353e+03 -0.008 0.993398
## MOSTYPE26 -1.003e+01 1.353e+03 -0.007 0.994083
## MOSTYPE27 -1.262e+01 1.353e+03 -0.009 0.992558
## MOSTYPE28 -3.110e+01 2.383e+03 -0.013 0.989587
## MOSTYPE29 6.729e+00 1.967e+00 3.421 0.000623 ***
## MOSTYPE30 -1.006e+01 1.353e+03 -0.007 0.994065
## MOSTYPE31 -9.197e+00 1.353e+03 -0.007 0.994577
## MOSTYPE32 -8.997e+00 1.353e+03 -0.007 0.994694
## MOSTYPE33 7.609e+00 1.938e+00 3.927 8.62e-05 ***
## MOSTYPE34 7.385e-01 8.150e-01 0.906 0.364913
## MOSTYPE35 5.400e+00 1.849e+00 2.920 0.003496 **
## MOSTYPE36 7.741e+00 1.942e+00 3.986 6.72e-05 ***
## MOSTYPE37 7.191e+00 1.897e+00 3.792 0.000150 ***
## MOSTYPE38 7.522e+00 1.888e+00 3.984 6.77e-05 ***
## MOSTYPE39 6.441e+00 1.845e+00 3.492 0.000480 ***
## MOSTYPE40 -1.124e+01 6.541e+02 -0.017 0.986294
## MOSTYPE41 5.267e+00 1.901e+00 2.771 0.005594 **
## MGODRK1 7.030e-01 1.213e-01 5.794 6.89e-09 ***
## MGODRK2 9.424e-01 1.280e-01 7.361 1.82e-13 ***
## MGODRK3 -1.204e+00 2.944e-01 -4.090 4.31e-05 ***
## MGODRK4 -1.305e+00 5.893e-01 -2.215 0.026752 *
## MGODRK5 1.979e+00 6.371e-01 3.107 0.001889 **
## MGODRK6 2.481e+00 8.038e-01 3.087 0.002023 **
## MGODRK7 -1.502e+01 2.169e+03 -0.007 0.994474
## MGODRK8 -1.270e+01 2.986e+03 -0.004 0.996607
## MGODRK9 -1.226e+01 2.478e+03 -0.005 0.996053
## MGODPR1 6.325e-01 6.817e-01 0.928 0.353544
## MGODPR2 2.728e+00 6.276e-01 4.346 1.39e-05 ***
## MGODPR3 2.965e+00 6.131e-01 4.837 1.32e-06 ***
## MGODPR4 3.146e+00 6.143e-01 5.120 3.05e-07 ***
## MGODPR5 3.436e+00 6.136e-01 5.600 2.14e-08 ***
## MGODPR6 2.709e+00 6.028e-01 4.494 6.99e-06 ***
## MGODPR7 4.102e+00 6.214e-01 6.602 4.06e-11 ***
## MGODPR8 3.069e+00 7.053e-01 4.350 1.36e-05 ***
## MGODPR9 4.140e+00 6.635e-01 6.240 4.37e-10 ***
## MGODOV1 -4.422e-01 1.180e-01 -3.747 0.000179 ***
## MGODOV2 7.438e-02 1.191e-01 0.624 0.532382
## MGODOV3 6.231e-01 1.981e-01 3.146 0.001655 **
## MGODOV4 -5.756e-01 3.211e-01 -1.793 0.073001 .
## MGODOV5 2.210e+00 7.124e-01 3.102 0.001924 **
## MRELSA1 -2.780e-02 1.109e-01 -0.251 0.802101
## MRELSA2 1.148e-01 1.269e-01 0.905 0.365661
## MRELSA3 9.149e-01 2.658e-01 3.442 0.000578 ***
## MRELSA4 -2.655e+00 5.598e-01 -4.742 2.11e-06 ***
## MRELSA5 -1.712e+01 1.311e+03 -0.013 0.989582
## MRELSA6 -1.621e+01 1.504e+03 -0.011 0.991401
## MRELSA7 -1.523e+01 6.523e+03 -0.002 0.998137
## MRELOV1 -8.892e-02 1.844e-01 -0.482 0.629654
## MRELOV2 -2.407e-01 1.469e-01 -1.639 0.101290
## MRELOV3 -4.754e-01 1.651e-01 -2.880 0.003978 **
## MRELOV4 -8.582e-01 2.157e-01 -3.979 6.93e-05 ***
## MRELOV5 -1.502e+00 3.307e-01 -4.544 5.53e-06 ***
## MRELOV6 -2.860e+00 4.556e-01 -6.277 3.46e-10 ***
## MRELOV7 -3.892e+00 6.326e-01 -6.152 7.66e-10 ***
## MRELOV8 -1.509e+01 7.680e+02 -0.020 0.984328
## MRELOV9 1.960e+01 5.898e+02 0.033 0.973486
## MFALLEEN1 -1.122e-01 1.474e-01 -0.761 0.446783
## MFALLEEN2 -5.165e-02 1.974e-01 -0.262 0.793590
## MFALLEEN3 8.372e-02 2.728e-01 0.307 0.758928
## MFALLEEN4 4.383e-01 3.851e-01 1.138 0.255027
## MFALLEEN5 -2.181e-01 5.167e-01 -0.422 0.672883
## MFALLEEN6 1.376e+00 6.638e-01 2.074 0.038108 *
## MFALLEEN7 -1.730e+01 5.898e+02 -0.029 0.976601
## MFALLEEN8 2.637e+00 1.032e+00 2.555 0.010607 *
## MFALLEEN9 -1.491e+01 8.832e+02 -0.017 0.986530
## MFGEKIND1 1.281e+00 3.478e-01 3.682 0.000231 ***
## MFGEKIND2 6.275e-01 3.649e-01 1.720 0.085485 .
## MFGEKIND3 6.645e-01 4.014e-01 1.655 0.097874 .
## MFGEKIND4 8.021e-01 4.518e-01 1.776 0.075804 .
## MFGEKIND5 2.664e-01 5.305e-01 0.502 0.615605
## MFGEKIND6 -9.850e-01 6.095e-01 -1.616 0.106034
## MFGEKIND7 5.140e-01 7.529e-01 0.683 0.494852
## MFGEKIND8 4.297e+00 1.197e+00 3.588 0.000333 ***
## MFGEKIND9 -1.831e+00 1.440e+00 -1.271 0.203708
## MFWEKIND1 -1.813e+00 4.651e-01 -3.898 9.68e-05 ***
## MFWEKIND2 -1.139e+00 4.463e-01 -2.552 0.010697 *
## MFWEKIND3 -1.699e+00 4.849e-01 -3.504 0.000458 ***
## MFWEKIND4 -2.050e+00 5.426e-01 -3.778 0.000158 ***
## MFWEKIND5 -2.600e+00 5.951e-01 -4.370 1.24e-05 ***
## MFWEKIND6 -1.978e+00 6.704e-01 -2.950 0.003178 **
## MFWEKIND7 -2.432e+00 7.479e-01 -3.252 0.001147 **
## MFWEKIND8 -1.951e+00 8.512e-01 -2.292 0.021911 *
## MFWEKIND9 -1.151e+00 9.157e-01 -1.257 0.208580
## MOPLHOOG1 7.477e-01 1.366e-01 5.473 4.41e-08 ***
## MOPLHOOG2 1.074e-01 1.827e-01 0.588 0.556613
## MOPLHOOG3 -3.639e-01 2.811e-01 -1.295 0.195469
## MOPLHOOG4 3.225e-01 3.627e-01 0.889 0.373927
## MOPLHOOG5 6.917e-01 4.627e-01 1.495 0.134942
## MOPLHOOG6 -2.124e+00 6.704e-01 -3.169 0.001530 **
## MOPLHOOG7 -3.700e-01 7.939e-01 -0.466 0.641120
## MOPLHOOG8 -3.174e-01 8.979e-01 -0.354 0.723708
## MOPLHOOG9 -1.018e+00 1.223e+00 -0.832 0.405170
## MOPLMIDD1 -4.894e-01 4.190e-01 -1.168 0.242807
## MOPLMIDD2 -6.603e-01 3.925e-01 -1.682 0.092505 .
## MOPLMIDD3 -1.215e+00 4.317e-01 -2.815 0.004874 **
## MOPLMIDD4 -1.758e+00 4.737e-01 -3.711 0.000206 ***
## MOPLMIDD5 -1.594e+00 5.379e-01 -2.963 0.003043 **
## MOPLMIDD6 -1.622e+00 6.281e-01 -2.583 0.009805 **
## MOPLMIDD7 -1.781e+00 7.521e-01 -2.367 0.017914 *
## MOPLMIDD8 -2.042e+00 8.919e-01 -2.289 0.022058 *
## MOPLMIDD9 -2.408e+00 9.724e-01 -2.476 0.013290 *
## MOPLLAAG1 -1.520e+00 3.958e-01 -3.840 0.000123 ***
## MOPLLAAG2 -9.887e-01 3.880e-01 -2.548 0.010831 *
## MOPLLAAG3 -1.092e+00 4.230e-01 -2.581 0.009840 **
## MOPLLAAG4 -2.117e+00 4.860e-01 -4.356 1.32e-05 ***
## MOPLLAAG5 -2.370e+00 5.546e-01 -4.273 1.93e-05 ***
## MOPLLAAG6 -2.533e+00 6.277e-01 -4.036 5.44e-05 ***
## MOPLLAAG7 -3.256e+00 7.309e-01 -4.455 8.38e-06 ***
## MOPLLAAG8 -2.833e+00 8.593e-01 -3.297 0.000979 ***
## MOPLLAAG9 -5.633e+00 9.488e-01 -5.937 2.90e-09 ***
## MBERHOOG1 -4.298e-01 1.486e-01 -2.891 0.003836 **
## MBERHOOG2 5.324e-01 1.664e-01 3.199 0.001379 **
## MBERHOOG3 2.995e-01 2.233e-01 1.341 0.179828
## MBERHOOG4 1.576e+00 2.915e-01 5.408 6.37e-08 ***
## MBERHOOG5 9.862e-01 3.906e-01 2.525 0.011565 *
## MBERHOOG6 3.688e+00 4.841e-01 7.617 2.59e-14 ***
## MBERHOOG7 3.776e+00 5.992e-01 6.301 2.95e-10 ***
## MBERHOOG8 7.435e+00 9.388e-01 7.919 2.39e-15 ***
## MBERHOOG9 3.098e+00 8.571e-01 3.615 0.000301 ***
## MBERZELF1 6.746e-01 1.221e-01 5.523 3.33e-08 ***
## MBERZELF2 7.308e-01 1.897e-01 3.852 0.000117 ***
## MBERZELF3 9.332e-01 5.111e-01 1.826 0.067883 .
## MBERZELF4 2.150e+00 8.451e-01 2.544 0.010945 *
## MBERZELF5 1.407e+00 8.222e-01 1.711 0.087010 .
## MBERBOER1 7.390e-01 1.245e-01 5.936 2.92e-09 ***
## MBERBOER2 1.394e+00 1.996e-01 6.987 2.82e-12 ***
## MBERBOER3 3.230e+00 3.737e-01 8.643 < 2e-16 ***
## MBERBOER4 9.550e-01 6.170e-01 1.548 0.121691
## MBERBOER5 4.783e+00 7.229e-01 6.616 3.68e-11 ***
## MBERBOER6 -1.047e+01 1.167e+03 -0.009 0.992841
## MBERBOER7 4.427e+00 3.197e+03 0.001 0.998895
## MBERBOER8 1.236e+01 2.543e+03 0.005 0.996120
## MBERBOER9 9.109e+00 3.305e+03 0.003 0.997801
## MBERMIDD1 1.312e-02 2.716e-01 0.048 0.961475
## MBERMIDD2 1.028e+00 2.376e-01 4.325 1.52e-05 ***
## MBERMIDD3 1.011e+00 2.759e-01 3.662 0.000250 ***
## MBERMIDD4 2.199e+00 3.237e-01 6.792 1.11e-11 ***
## MBERMIDD5 2.071e+00 3.911e-01 5.295 1.19e-07 ***
## MBERMIDD6 3.543e+00 4.748e-01 7.462 8.53e-14 ***
## MBERMIDD7 3.913e+00 5.822e-01 6.721 1.81e-11 ***
## MBERMIDD8 -1.933e+01 1.273e+03 -0.015 0.987880
## MBERMIDD9 5.097e+00 7.254e-01 7.027 2.11e-12 ***
## MBERARBG1 1.485e+00 1.644e-01 9.031 < 2e-16 ***
## MBERARBG2 1.259e+00 1.915e-01 6.576 4.84e-11 ***
## MBERARBG3 1.034e+00 2.427e-01 4.262 2.03e-05 ***
## MBERARBG4 1.634e+00 3.052e-01 5.353 8.64e-08 ***
## MBERARBG5 2.963e+00 3.883e-01 7.630 2.35e-14 ***
## MBERARBG6 1.114e+00 4.935e-01 2.258 0.023951 *
## MBERARBG7 5.194e+00 6.536e-01 7.947 1.90e-15 ***
## MBERARBG8 4.121e+00 7.117e-01 5.790 7.05e-09 ***
## MBERARBG9 6.042e+00 9.366e-01 6.451 1.11e-10 ***
## MBERARBO1 7.554e-03 1.651e-01 0.046 0.963514
## MBERARBO2 2.503e-01 1.861e-01 1.345 0.178733
## MBERARBO3 7.295e-01 2.297e-01 3.175 0.001497 **
## MBERARBO4 1.779e+00 2.906e-01 6.121 9.28e-10 ***
## MBERARBO5 2.164e+00 3.927e-01 5.512 3.56e-08 ***
## MBERARBO6 1.532e+00 4.869e-01 3.147 0.001648 **
## MBERARBO7 3.781e+00 6.914e-01 5.470 4.51e-08 ***
## MBERARBO8 4.360e+00 1.182e+00 3.689 0.000225 ***
## MBERARBO9 -1.079e+01 1.045e+03 -0.010 0.991759
## MSKB11 2.628e-01 1.514e-01 1.736 0.082483 .
## MSKB12 -5.124e-01 1.563e-01 -3.278 0.001046 **
## MSKB13 -1.925e-01 1.842e-01 -1.046 0.295783
## MSKB14 -1.041e+00 2.654e-01 -3.924 8.72e-05 ***
## MSKB15 -4.193e-01 3.899e-01 -1.076 0.282112
## MSKB16 9.354e-01 4.931e-01 1.897 0.057835 .
## MSKB17 -1.747e+01 1.994e+03 -0.009 0.993008
## MSKB18 1.399e+00 8.087e-01 1.730 0.083718 .
## MSKB19 -1.970e+01 1.324e+03 -0.015 0.988123
## MSKB21 -6.648e-01 1.806e-01 -3.681 0.000232 ***
## MSKB22 -4.053e-01 1.660e-01 -2.441 0.014650 *
## MSKB23 -8.808e-01 1.941e-01 -4.538 5.67e-06 ***
## MSKB24 -3.325e-01 2.132e-01 -1.560 0.118866
## MSKB25 -7.836e-01 2.779e-01 -2.820 0.004798 **
## MSKB26 -9.965e-01 4.783e-01 -2.084 0.037204 *
## MSKB27 -1.734e+01 1.992e+03 -0.009 0.993055
## MSKB28 -2.199e+01 2.096e+03 -0.010 0.991631
## MSKB29 9.281e+00 5.601e+03 0.002 0.998678
## MSKC1 1.686e+00 3.841e-01 4.390 1.13e-05 ***
## MSKC2 1.744e+00 3.646e-01 4.782 1.74e-06 ***
## MSKC3 1.453e+00 3.654e-01 3.976 7.00e-05 ***
## MSKC4 1.712e+00 3.879e-01 4.414 1.02e-05 ***
## MSKC5 1.325e+00 4.119e-01 3.216 0.001301 **
## MSKC6 1.681e+00 4.554e-01 3.692 0.000223 ***
## MSKC7 2.239e+00 5.240e-01 4.273 1.93e-05 ***
## MSKC8 2.832e+00 5.861e-01 4.832 1.35e-06 ***
## MSKC9 1.080e+00 6.355e-01 1.700 0.089166 .
## MSKD1 -1.564e-01 1.173e-01 -1.333 0.182407
## MSKD2 -2.059e-01 1.449e-01 -1.421 0.155353
## MSKD3 -8.769e-01 2.247e-01 -3.903 9.51e-05 ***
## MSKD4 -4.739e-01 3.559e-01 -1.331 0.183091
## MSKD5 -3.516e+00 9.437e-01 -3.726 0.000195 ***
## MSKD6 -1.777e+01 5.875e+02 -0.030 0.975875
## MSKD7 -5.522e-01 1.196e+00 -0.462 0.644364
## MSKD9 -1.417e+01 6.523e+03 -0.002 0.998267
## MHKOOP1 -9.229e-01 2.222e-01 -4.153 3.28e-05 ***
## MHKOOP2 -1.565e+00 2.048e-01 -7.642 2.14e-14 ***
## MHKOOP3 -7.291e-01 2.027e-01 -3.596 0.000323 ***
## MHKOOP4 -1.166e+00 2.074e-01 -5.622 1.89e-08 ***
## MHKOOP5 -9.826e-01 1.964e-01 -5.002 5.68e-07 ***
## MHKOOP6 -4.430e-01 2.006e-01 -2.208 0.027225 *
## MHKOOP7 -1.256e+00 1.945e-01 -6.455 1.08e-10 ***
## MHKOOP8 -8.889e-01 2.118e-01 -4.197 2.71e-05 ***
## MHKOOP9 -3.542e-01 1.975e-01 -1.794 0.072837 .
## MAUT11 7.182e+00 7.930e+03 0.001 0.999277
## MAUT12 1.142e+01 4.302e+03 0.003 0.997882
## MAUT13 8.272e+00 4.302e+03 0.002 0.998466
## MAUT14 1.189e+01 4.302e+03 0.003 0.997796
## MAUT15 1.245e+01 4.302e+03 0.003 0.997690
## MAUT16 1.220e+01 4.302e+03 0.003 0.997737
## MAUT17 1.205e+01 4.302e+03 0.003 0.997766
## MAUT18 1.186e+01 4.302e+03 0.003 0.997801
## MAUT19 1.061e+01 4.302e+03 0.002 0.998032
## MAUT21 2.048e-01 1.438e-01 1.424 0.154311
## MAUT22 -5.603e-01 2.131e-01 -2.629 0.008562 **
## MAUT23 -1.414e+00 3.424e-01 -4.130 3.63e-05 ***
## MAUT24 -6.115e-01 4.561e-01 -1.341 0.180018
## MAUT25 -1.521e+00 7.340e-01 -2.072 0.038304 *
## MAUT26 1.689e+00 1.412e+00 1.196 0.231670
## MAUT27 4.722e+00 6.600e+03 0.001 0.999429
## MAUT01 -7.409e-01 1.867e-01 -3.969 7.23e-05 ***
## MAUT02 -4.470e-01 2.235e-01 -2.000 0.045509 *
## MAUT03 -4.565e-01 3.369e-01 -1.355 0.175423
## MAUT04 -2.240e+00 4.569e-01 -4.904 9.41e-07 ***
## MAUT05 -1.108e+00 6.313e-01 -1.755 0.079331 .
## MAUT06 3.528e+00 1.027e+00 3.437 0.000589 ***
## MAUT07 -4.590e+01 1.104e+03 -0.042 0.966821
## MAUT08 -1.159e+01 6.750e+03 -0.002 0.998630
## MAUT09 -7.937e+00 4.820e+03 -0.002 0.998686
## MZFONDS1 -1.793e+01 1.253e+03 -0.014 0.988580
## MZFONDS2 4.261e-01 4.114e-01 1.036 0.300278
## MZFONDS3 -1.769e-01 4.294e-01 -0.412 0.680374
## MZFONDS4 -7.074e-02 4.214e-01 -0.168 0.866684
## MZFONDS5 2.833e-01 4.234e-01 0.669 0.503475
## MZFONDS6 -4.051e-01 4.246e-01 -0.954 0.339967
## MZFONDS7 5.477e-01 4.287e-01 1.278 0.201385
## MZFONDS8 1.010e+00 4.474e-01 2.258 0.023939 *
## MZFONDS9 6.235e-01 4.498e-01 1.386 0.165690
## MINKM301 7.072e-02 1.597e-01 0.443 0.657930
## MINKM302 2.935e-01 1.794e-01 1.636 0.101887
## MINKM303 -1.269e+00 2.504e-01 -5.070 3.98e-07 ***
## MINKM304 -1.904e+00 3.356e-01 -5.673 1.41e-08 ***
## MINKM305 -1.644e+00 4.212e-01 -3.903 9.51e-05 ***
## MINKM306 -1.134e+00 5.111e-01 -2.219 0.026490 *
## MINKM307 -2.092e+00 6.618e-01 -3.162 0.001569 **
## MINKM308 -3.000e+00 8.073e-01 -3.716 0.000202 ***
## MINKM309 -1.956e+01 5.913e+02 -0.033 0.973619
## MINK30451 -1.095e-01 3.015e-01 -0.363 0.716430
## MINK30452 -4.677e-01 2.642e-01 -1.770 0.076758 .
## MINK30453 -9.844e-01 3.121e-01 -3.154 0.001611 **
## MINK30454 -1.510e+00 3.603e-01 -4.192 2.77e-05 ***
## MINK30455 -2.055e+00 4.456e-01 -4.612 3.98e-06 ***
## MINK30456 -2.658e+00 5.315e-01 -5.001 5.69e-07 ***
## MINK30457 -2.673e+00 6.216e-01 -4.300 1.71e-05 ***
## MINK30458 -2.864e+00 8.028e-01 -3.567 0.000361 ***
## MINK30459 -8.936e-01 7.864e-01 -1.136 0.255802
## MINK45751 -4.089e-01 2.135e-01 -1.915 0.055510 .
## MINK45752 -4.238e-01 2.128e-01 -1.992 0.046401 *
## MINK45753 -3.178e-01 2.649e-01 -1.200 0.230241
## MINK45754 -1.026e+00 3.321e-01 -3.089 0.002007 **
## MINK45755 -1.049e+00 4.309e-01 -2.434 0.014932 *
## MINK45756 -1.605e+00 5.160e-01 -3.110 0.001869 **
## MINK45757 -2.377e+00 6.419e-01 -3.703 0.000213 ***
## MINK45758 -3.514e+00 7.744e-01 -4.538 5.68e-06 ***
## MINK45759 -3.860e+00 8.202e-01 -4.706 2.53e-06 ***
## MINK75121 4.807e-01 1.154e-01 4.166 3.10e-05 ***
## MINK75122 -9.441e-02 1.663e-01 -0.568 0.570276
## MINK75123 -1.022e+00 2.698e-01 -3.788 0.000152 ***
## MINK75124 -6.879e-01 3.651e-01 -1.884 0.059519 .
## MINK75125 -2.128e+00 5.301e-01 -4.015 5.95e-05 ***
## MINK75126 -1.935e+01 1.649e+03 -0.012 0.990637
## MINK75127 -2.080e+01 6.523e+03 -0.003 0.997455
## MINK75128 -2.180e+01 2.372e+03 -0.009 0.992667
## MINK75129 3.470e+00 1.257e+00 2.760 0.005785 **
## MINK123M1 -5.495e-01 1.419e-01 -3.873 0.000107 ***
## MINK123M2 -8.495e-01 3.338e-01 -2.545 0.010937 *
## MINK123M3 -3.197e+00 7.446e-01 -4.294 1.76e-05 ***
## MINK123M4 -2.144e+01 1.008e+03 -0.021 0.983027
## MINK123M5 -9.483e-02 6.656e+03 0.000 0.999989
## MINK123M7 -2.423e+01 6.523e+03 -0.004 0.997036
## MINK123M9 3.211e-01 6.656e+03 0.000 0.999962
## MKOOPKLA2 1.356e+00 7.787e-01 1.742 0.081587 .
## MKOOPKLA3 -1.643e+01 1.353e+03 -0.012 0.990314
## MKOOPKLA4 -1.568e+01 1.353e+03 -0.012 0.990755
## MKOOPKLA5 -1.533e+01 1.353e+03 -0.011 0.990958
## MKOOPKLA6 -9.943e+00 1.353e+03 -0.007 0.994136
## MKOOPKLA7 -1.038e+01 1.353e+03 -0.008 0.993878
## MKOOPKLA8 -1.029e+01 1.353e+03 -0.008 0.993934
## PWABEDR1 -1.643e+01 2.052e+03 -0.008 0.993612
## PWABEDR2 1.593e+00 4.875e-01 3.268 0.001084 **
## PWABEDR3 9.570e-01 5.578e-01 1.716 0.086222 .
## PWABEDR4 -1.897e+01 1.226e+03 -0.015 0.987658
## PWABEDR5 1.111e+01 9.541e+03 0.001 0.999071
## PWABEDR6 -1.651e+01 3.233e+03 -0.005 0.995924
## PWALAND2 -1.751e+01 3.522e+03 -0.005 0.996033
## PWALAND3 -1.839e+00 4.915e-01 -3.742 0.000183 ***
## PWALAND4 -1.821e+00 5.563e-01 -3.274 0.001062 **
## PPERSAUT4 -1.883e+01 6.523e+03 -0.003 0.997696
## PPERSAUT5 -5.477e-02 1.280e-01 -0.428 0.668716
## PPERSAUT6 2.005e+00 7.165e-02 27.988 < 2e-16 ***
## PPERSAUT7 -1.781e+01 7.944e+02 -0.022 0.982112
## PPERSAUT8 -1.768e+01 4.600e+03 -0.004 0.996933
## PVRAAUT4 -1.357e+00 6.637e+03 0.000 0.999837
## PVRAAUT6 -1.802e+01 2.439e+03 -0.007 0.994104
## PVRAAUT9 2.729e+00 6.571e+03 0.000 0.999669
## PAANHANG1 4.933e-02 5.405e-01 0.091 0.927272
## PAANHANG2 1.501e-01 4.553e-01 0.330 0.741550
## PAANHANG3 -1.662e+01 2.450e+03 -0.007 0.994588
## PAANHANG4 -2.016e+01 6.523e+03 -0.003 0.997534
## PAANHANG5 -3.358e+00 6.964e+03 0.000 0.999615
## PWERKT2 -1.529e+01 2.280e+03 -0.007 0.994650
## PWERKT3 -1.576e+01 2.035e+03 -0.008 0.993818
## PWERKT4 -1.795e+01 1.855e+03 -0.010 0.992279
## PWERKT6 -1.209e+01 6.523e+03 -0.002 0.998521
## PPERSONG1 -1.880e+01 3.291e+03 -0.006 0.995441
## PPERSONG2 -3.514e+00 9.973e-01 -3.523 0.000426 ***
## PPERSONG3 -2.013e+01 2.338e+03 -0.009 0.993131
## PPERSONG4 -1.632e+01 3.084e+03 -0.005 0.995778
## PPERSONG5 -1.422e+01 6.523e+03 -0.002 0.998261
## PPERSONG6 -1.586e+01 3.671e+03 -0.004 0.996553
## PWAOREG4 -1.906e+01 6.523e+03 -0.003 0.997669
## PWAOREG5 -1.233e+01 6.523e+03 -0.002 0.998492
## PWAOREG6 3.777e+00 4.798e-01 7.872 3.50e-15 ***
## PWAOREG7 -2.153e+01 5.589e+03 -0.004 0.996927
## PPLEZIER1 6.305e+00 1.595e+00 3.952 7.75e-05 ***
## PPLEZIER2 9.672e-01 1.521e+00 0.636 0.524861
## PPLEZIER3 4.478e+00 1.488e+00 3.008 0.002626 **
## PPLEZIER4 1.757e+00 1.253e+00 1.402 0.160955
## PPLEZIER5 -1.444e+01 3.623e+03 -0.004 0.996819
## PPLEZIER6 4.053e+01 8.550e+02 0.047 0.962195
## AWAPART1 3.762e-01 8.209e-02 4.583 4.58e-06 ***
## AWAPART2 -1.798e+01 1.920e+03 -0.009 0.992528
## ABESAUT1 1.006e+00 4.200e-01 2.395 0.016608 *
## ABESAUT2 -1.675e+01 2.618e+03 -0.006 0.994894
## ABESAUT3 -1.392e+01 2.721e+03 -0.005 0.995918
## ABESAUT4 4.541e+01 1.011e+04 0.004 0.996415
## AMOTSCO1 5.853e-02 1.575e-01 0.372 0.710210
## AMOTSCO2 9.850e-03 5.264e-01 0.019 0.985072
## AMOTSCO8 -1.800e+01 6.523e+03 -0.003 0.997798
## ATRACTOR1 -7.002e-01 3.318e-01 -2.110 0.034858 *
## ATRACTOR2 2.363e-01 6.651e-01 0.355 0.722347
## ATRACTOR3 -1.566e+01 2.178e+03 -0.007 0.994264
## ATRACTOR4 -1.434e+01 2.010e+03 -0.007 0.994307
## ABROM1 -7.609e-01 1.628e-01 -4.673 2.97e-06 ***
## ABROM2 -1.724e+01 1.472e+03 -0.012 0.990660
## ALEVEN1 -9.627e-01 1.964e-01 -4.902 9.46e-07 ***
## ALEVEN2 -1.692e-01 2.089e-01 -0.810 0.417797
## ALEVEN3 1.454e-02 4.856e-01 0.030 0.976111
## ALEVEN4 2.092e+00 6.509e-01 3.214 0.001308 **
## ALEVEN8 1.717e+01 8.003e+03 0.002 0.998288
## AGEZONG1 3.194e-01 3.252e-01 0.982 0.325952
## ABRAND1 5.744e-01 8.376e-02 6.858 6.98e-12 ***
## ABRAND2 7.562e-02 2.310e-01 0.327 0.743435
## ABRAND3 -1.917e+01 1.682e+03 -0.011 0.990904
## ABRAND4 -1.776e+01 2.707e+03 -0.007 0.994765
## ABRAND5 -8.689e+00 2.752e+03 -0.003 0.997481
## ABRAND7 -1.703e+01 6.523e+03 -0.003 0.997917
## AZEILPL1 -1.844e+01 1.833e+03 -0.010 0.991975
## APLEZIER1 8.712e-02 1.341e+00 0.065 0.948190
## APLEZIER2 NA NA NA NA
## AFIETS1 1.462e+00 2.241e-01 6.525 6.80e-11 ***
## AFIETS2 2.062e-01 3.536e-01 0.583 0.559904
## AFIETS3 1.601e+00 1.384e+00 1.157 0.247467
## AINBOED1 -4.217e-01 3.385e-01 -1.246 0.212785
## AINBOED2 -2.158e+01 6.523e+03 -0.003 0.997360
## ABYSTAND1 1.042e+00 2.132e-01 4.887 1.02e-06 ***
## ABYSTAND2 -1.837e+01 6.523e+03 -0.003 0.997752
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 15177.2 on 10947 degrees of freedom
## Residual deviance: 8889.3 on 10559 degrees of freedom
## AIC: 9667.3
##
## Number of Fisher Scoring iterations: 17
# difference between null deviance. = 15177 - 12386 = 2791
# step(step.wise1, direction = "backwards")
Here we have applied GLM to our 56 variables, because it must be remembered that our dataset is categorical in nature. After having run the regression we have see that most of the variables are significant. This means that the dimension reduction we took earlier is helping us to come up with a good model
Now we went one step further and applied a step-wise backwards regression which further helped us reduce variables from 56 to 41 variables with error in null deviance to 2783
In the next step we will take the 41 variables which we got from step-wise
for(i in 1:ncol(train_2)){
train_2[,i] <- as.factor(train_2[,i])
}
model.2 = glm(formula = CARAVAN ~ MOSTYPE + MGODRK + MGODPR + MGODOV +
MRELGE + MRELSA + MOPLMIDD + MOPLLAAG + MBERHOOG + MBERZELF +
MBERBOER + MBERMIDD + MBERARBG + MBERARBO + MSKC + MSKD +
MHKOOP + MAUT1 + MAUT2 + MAUT0 + MINK3045 + MINK7512 + MINK123M +
MKOOPKLA + PPERSAUT + PMOTSCO + PVRAAUT + PAANHANG + PWERKT +
PWAOREG + PPLEZIER + AWAPART + AWALAND + ABROM + ALEVEN +
APERSONG + AGEZONG + ABRAND + APLEZIER + AFIETS + ABYSTAND,
family = binomial(link = "logit"), data = over_train)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
model.2$xlevels[["MSKD"]] <- union(model.2$xlevels[["MSKD"]], levels(over_test$MSKD))
model.2$xlevels[["MAUT2"]] <- union(model.2$xlevels[["MAUT2"]], levels(over_test$MAUT2))
model.2$xlevels[["MINK123M"]] <- union(model.2$xlevels[["MINK123M"]], levels(over_test$MINK123M))
model.2$xlevels[["PPERSAUT"]] <- union(model.2$xlevels[["PPERSAUT"]], levels(over_test$PPERSAUT))
model.2$xlevels[["PVRAAUT"]] <- union(model.2$xlevels[["PVRAAUT"]], levels(over_test$PVRAAUT))
model.2$xlevels[["PWERKT"]] <- union(model.2$xlevels[["PWERKT"]], levels(over_test$PWERKT))
model.2$xlevels[["ABROM"]] <- union(model.2$xlevels[["ABROM"]], levels(over_test$ABROM))
model.2$xlevels[["ALEVEN"]] <- union(model.2$xlevels[["ALEVEN"]], levels(over_test$ALEVEN))
model.2$xlevels[["ABRAND"]] <- union(model.2$xlevels[["ABRAND"]], levels(over_test$ABRAND))
model.2$xlevels[["AFIETS"]] <- union(model.2$xlevels[["AFIETS"]], levels(over_test$AFIETS))
drewSummary(model.2)
##
## Call:
## glm(formula = CARAVAN ~ MOSTYPE + MGODRK + MGODPR + MGODOV +
## MRELGE + MRELSA + MOPLMIDD + MOPLLAAG + MBERHOOG + MBERZELF +
## MBERBOER + MBERMIDD + MBERARBG + MBERARBO + MSKC + MSKD +
## MHKOOP + MAUT1 + MAUT2 + MAUT0 + MINK3045 + MINK7512 + MINK123M +
## MKOOPKLA + PPERSAUT + PMOTSCO + PVRAAUT + PAANHANG + PWERKT +
## PWAOREG + PPLEZIER + AWAPART + AWALAND + ABROM + ALEVEN +
## APERSONG + AGEZONG + ABRAND + APLEZIER + AFIETS + ABYSTAND,
## family = binomial(link = "logit"), data = over_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3299 -0.6490 0.0000 0.7778 1.9839
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.087e+00 4.635e+03 -0.001 0.998952
## MOSTYPE2 -8.090e-01 7.424e-01 -1.090 0.275839
## MOSTYPE3 -3.755e-01 6.901e-01 -0.544 0.586295
## MOSTYPE4 -1.662e+00 7.409e-01 -2.243 0.024883 *
## MOSTYPE5 1.506e+00 1.613e+00 0.933 0.350609
## MOSTYPE6 -7.457e-01 2.646e-01 -2.818 0.004828 **
## MOSTYPE7 -1.701e+00 7.313e-01 -2.325 0.020050 *
## MOSTYPE8 5.591e-01 4.710e-01 1.187 0.235210
## MOSTYPE9 1.350e+00 1.553e+00 0.869 0.384893
## MOSTYPE10 -6.468e-01 2.673e-01 -2.419 0.015546 *
## MOSTYPE11 -4.601e-01 6.821e-01 -0.675 0.499944
## MOSTYPE12 1.705e+00 4.805e-01 3.548 0.000388 ***
## MOSTYPE13 -1.136e+00 6.870e-01 -1.653 0.098272 .
## MOSTYPE15 -2.856e+01 3.101e+03 -0.009 0.992653
## MOSTYPE16 -2.899e+01 1.713e+03 -0.017 0.986495
## MOSTYPE17 -1.918e+01 1.886e+03 -0.010 0.991885
## MOSTYPE18 -3.102e+01 1.749e+03 -0.018 0.985849
## MOSTYPE19 -1.763e+01 3.117e+03 -0.006 0.995486
## MOSTYPE20 -1.406e+01 1.335e+03 -0.011 0.991597
## MOSTYPE21 -2.940e+01 1.945e+03 -0.015 0.987938
## MOSTYPE22 -1.450e+01 1.335e+03 -0.011 0.991330
## MOSTYPE23 9.836e-01 1.602e+00 0.614 0.539349
## MOSTYPE24 -1.435e+01 1.335e+03 -0.011 0.991420
## MOSTYPE25 -1.466e+01 1.335e+03 -0.011 0.991234
## MOSTYPE26 -1.339e+01 1.335e+03 -0.010 0.991995
## MOSTYPE27 -1.553e+01 1.335e+03 -0.012 0.990718
## MOSTYPE28 -3.207e+01 1.667e+03 -0.019 0.984653
## MOSTYPE29 1.286e+00 1.621e+00 0.793 0.427564
## MOSTYPE30 -1.362e+01 1.335e+03 -0.010 0.991858
## MOSTYPE31 -1.379e+01 1.335e+03 -0.010 0.991754
## MOSTYPE32 -1.340e+01 1.335e+03 -0.010 0.991991
## MOSTYPE33 2.375e+00 1.591e+00 1.493 0.135507
## MOSTYPE34 -4.549e-01 7.010e-01 -0.649 0.516387
## MOSTYPE35 7.191e-01 1.517e+00 0.474 0.635566
## MOSTYPE36 3.007e+00 1.596e+00 1.884 0.059612 .
## MOSTYPE37 2.472e+00 1.557e+00 1.588 0.112277
## MOSTYPE38 2.255e+00 1.551e+00 1.454 0.145964
## MOSTYPE39 1.555e+00 1.509e+00 1.031 0.302768
## MOSTYPE40 -1.687e+01 6.559e+02 -0.026 0.979484
## MOSTYPE41 8.434e-01 1.565e+00 0.539 0.589972
## MGODRK1 3.333e-01 9.363e-02 3.560 0.000371 ***
## MGODRK2 3.908e-01 1.028e-01 3.800 0.000145 ***
## MGODRK3 -8.136e-01 2.342e-01 -3.474 0.000512 ***
## MGODRK4 -1.401e+00 5.401e-01 -2.594 0.009483 **
## MGODRK5 1.709e+00 5.439e-01 3.142 0.001678 **
## MGODRK6 7.815e-01 6.353e-01 1.230 0.218655
## MGODRK7 -1.571e+01 2.372e+03 -0.007 0.994716
## MGODRK8 -1.468e+01 3.218e+03 -0.005 0.996361
## MGODRK9 -1.490e+01 2.503e+03 -0.006 0.995251
## MGODPR1 6.560e-01 5.705e-01 1.150 0.250167
## MGODPR2 2.539e+00 5.123e-01 4.956 7.20e-07 ***
## MGODPR3 2.541e+00 5.065e-01 5.016 5.27e-07 ***
## MGODPR4 2.833e+00 5.057e-01 5.603 2.11e-08 ***
## MGODPR5 2.737e+00 5.032e-01 5.440 5.32e-08 ***
## MGODPR6 2.412e+00 5.027e-01 4.798 1.60e-06 ***
## MGODPR7 3.350e+00 5.117e-01 6.546 5.90e-11 ***
## MGODPR8 2.761e+00 5.898e-01 4.681 2.85e-06 ***
## MGODPR9 2.629e+00 5.366e-01 4.899 9.63e-07 ***
## MGODOV1 -4.952e-01 9.676e-02 -5.118 3.09e-07 ***
## MGODOV2 3.286e-03 9.521e-02 0.035 0.972471
## MGODOV3 3.302e-01 1.662e-01 1.987 0.046945 *
## MGODOV4 -8.363e-01 2.562e-01 -3.264 0.001098 **
## MGODOV5 1.790e+00 5.261e-01 3.403 0.000666 ***
## MRELGE1 -1.843e+01 4.200e+02 -0.044 0.965005
## MRELGE2 -2.574e+00 5.918e-01 -4.350 1.36e-05 ***
## MRELGE3 -9.259e-01 5.304e-01 -1.746 0.080873 .
## MRELGE4 -1.254e+00 5.151e-01 -2.434 0.014929 *
## MRELGE5 -1.101e+00 5.007e-01 -2.199 0.027902 *
## MRELGE6 -1.389e+00 5.079e-01 -2.734 0.006259 **
## MRELGE7 -9.151e-01 5.043e-01 -1.815 0.069593 .
## MRELGE8 -8.566e-01 5.225e-01 -1.639 0.101112
## MRELGE9 -8.156e-01 5.074e-01 -1.608 0.107941
## MRELSA1 -1.420e-02 9.495e-02 -0.150 0.881096
## MRELSA2 -4.716e-03 1.130e-01 -0.042 0.966707
## MRELSA3 7.580e-01 2.391e-01 3.170 0.001522 **
## MRELSA4 -1.907e+00 5.138e-01 -3.712 0.000206 ***
## MRELSA5 -1.758e+01 1.243e+03 -0.014 0.988718
## MRELSA6 -1.867e+01 1.548e+03 -0.012 0.990376
## MRELSA7 -8.653e-01 6.536e+03 0.000 0.999894
## MOPLMIDD1 -9.922e-02 3.135e-01 -0.316 0.751641
## MOPLMIDD2 -6.028e-01 2.685e-01 -2.245 0.024769 *
## MOPLMIDD3 -9.626e-01 2.598e-01 -3.706 0.000211 ***
## MOPLMIDD4 -1.279e+00 2.625e-01 -4.873 1.10e-06 ***
## MOPLMIDD5 -1.180e+00 2.732e-01 -4.319 1.57e-05 ***
## MOPLMIDD6 -1.360e+00 3.014e-01 -4.513 6.38e-06 ***
## MOPLMIDD7 -1.377e+00 3.375e-01 -4.079 4.52e-05 ***
## MOPLMIDD8 -2.117e+00 4.896e-01 -4.324 1.53e-05 ***
## MOPLMIDD9 -2.567e+00 4.502e-01 -5.702 1.19e-08 ***
## MOPLLAAG1 -9.185e-01 2.866e-01 -3.205 0.001353 **
## MOPLLAAG2 -6.587e-01 2.497e-01 -2.638 0.008348 **
## MOPLLAAG3 -1.006e+00 2.485e-01 -4.048 5.18e-05 ***
## MOPLLAAG4 -1.719e+00 2.584e-01 -6.654 2.84e-11 ***
## MOPLLAAG5 -2.011e+00 2.761e-01 -7.285 3.22e-13 ***
## MOPLLAAG6 -2.161e+00 3.003e-01 -7.197 6.17e-13 ***
## MOPLLAAG7 -2.417e+00 3.378e-01 -7.154 8.43e-13 ***
## MOPLLAAG8 -2.461e+00 4.146e-01 -5.934 2.95e-09 ***
## MOPLLAAG9 -4.682e+00 4.547e-01 -10.296 < 2e-16 ***
## MBERHOOG1 -1.462e-01 1.170e-01 -1.250 0.211457
## MBERHOOG2 6.360e-01 1.321e-01 4.814 1.48e-06 ***
## MBERHOOG3 2.081e-01 1.833e-01 1.136 0.256058
## MBERHOOG4 9.875e-01 2.373e-01 4.161 3.17e-05 ***
## MBERHOOG5 6.289e-01 3.184e-01 1.975 0.048292 *
## MBERHOOG6 2.216e+00 3.834e-01 5.780 7.45e-09 ***
## MBERHOOG7 2.762e+00 4.777e-01 5.783 7.34e-09 ***
## MBERHOOG8 4.752e+00 6.885e-01 6.902 5.13e-12 ***
## MBERHOOG9 1.365e+00 6.605e-01 2.067 0.038721 *
## MBERZELF1 4.653e-01 1.014e-01 4.590 4.44e-06 ***
## MBERZELF2 5.454e-01 1.493e-01 3.653 0.000260 ***
## MBERZELF3 1.164e+00 4.264e-01 2.730 0.006329 **
## MBERZELF4 6.743e-01 7.072e-01 0.954 0.340320
## MBERZELF5 3.057e-01 4.992e-01 0.612 0.540329
## MBERBOER1 2.768e-01 1.028e-01 2.691 0.007123 **
## MBERBOER2 4.612e-01 1.643e-01 2.807 0.004999 **
## MBERBOER3 1.462e+00 2.885e-01 5.066 4.07e-07 ***
## MBERBOER4 -9.216e-02 4.646e-01 -0.198 0.842771
## MBERBOER5 3.112e+00 5.968e-01 5.215 1.84e-07 ***
## MBERBOER6 -1.218e+01 1.225e+03 -0.010 0.992067
## MBERBOER7 4.020e+00 3.613e+03 0.001 0.999112
## MBERBOER8 8.866e+00 2.635e+03 0.003 0.997316
## MBERBOER9 6.482e+00 3.287e+03 0.002 0.998427
## MBERMIDD1 -6.315e-01 2.034e-01 -3.105 0.001904 **
## MBERMIDD2 8.740e-02 1.753e-01 0.499 0.618005
## MBERMIDD3 9.349e-02 2.084e-01 0.449 0.653698
## MBERMIDD4 8.949e-01 2.523e-01 3.546 0.000391 ***
## MBERMIDD5 9.266e-01 3.150e-01 2.942 0.003265 **
## MBERMIDD6 1.623e+00 3.723e-01 4.360 1.30e-05 ***
## MBERMIDD7 1.763e+00 4.596e-01 3.835 0.000125 ***
## MBERMIDD8 -1.934e+01 1.300e+03 -0.015 0.988126
## MBERMIDD9 3.016e+00 5.639e-01 5.348 8.89e-08 ***
## MBERARBG1 1.160e+00 1.362e-01 8.516 < 2e-16 ***
## MBERARBG2 1.032e+00 1.588e-01 6.496 8.27e-11 ***
## MBERARBG3 6.719e-01 2.060e-01 3.261 0.001109 **
## MBERARBG4 1.046e+00 2.559e-01 4.086 4.39e-05 ***
## MBERARBG5 2.016e+00 3.275e-01 6.154 7.57e-10 ***
## MBERARBG6 5.985e-01 4.186e-01 1.430 0.152808
## MBERARBG7 4.111e+00 5.750e-01 7.150 8.67e-13 ***
## MBERARBG8 3.113e+00 6.222e-01 5.004 5.61e-07 ***
## MBERARBG9 4.942e+00 8.327e-01 5.934 2.95e-09 ***
## MBERARBO1 -3.294e-02 1.316e-01 -0.250 0.802362
## MBERARBO2 1.613e-01 1.498e-01 1.076 0.281743
## MBERARBO3 4.333e-01 1.865e-01 2.323 0.020179 *
## MBERARBO4 1.291e+00 2.363e-01 5.465 4.62e-08 ***
## MBERARBO5 1.468e+00 3.195e-01 4.594 4.35e-06 ***
## MBERARBO6 1.256e+00 4.096e-01 3.066 0.002170 **
## MBERARBO7 1.941e+00 6.203e-01 3.129 0.001753 **
## MBERARBO8 3.977e+00 8.972e-01 4.433 9.28e-06 ***
## MBERARBO9 -1.585e+01 9.171e+02 -0.017 0.986212
## MSKC1 5.737e-01 2.724e-01 2.106 0.035208 *
## MSKC2 5.813e-01 2.379e-01 2.443 0.014548 *
## MSKC3 5.408e-01 2.340e-01 2.311 0.020811 *
## MSKC4 8.697e-01 2.478e-01 3.509 0.000449 ***
## MSKC5 8.780e-01 2.537e-01 3.461 0.000539 ***
## MSKC6 1.444e+00 2.841e-01 5.082 3.74e-07 ***
## MSKC7 1.322e+00 3.159e-01 4.185 2.85e-05 ***
## MSKC8 2.530e+00 3.998e-01 6.327 2.49e-10 ***
## MSKC9 9.878e-02 4.500e-01 0.220 0.826247
## MSKD1 8.764e-02 9.576e-02 0.915 0.360110
## MSKD2 -4.269e-02 1.146e-01 -0.372 0.709546
## MSKD3 -4.204e-01 1.661e-01 -2.531 0.011376 *
## MSKD4 -6.902e-01 2.617e-01 -2.638 0.008341 **
## MSKD5 -1.923e+00 6.259e-01 -3.072 0.002128 **
## MSKD6 -1.597e+01 5.245e+02 -0.030 0.975704
## MSKD7 3.037e+00 1.172e+00 2.591 0.009560 **
## MSKD9 -1.412e+01 6.523e+03 -0.002 0.998273
## MHKOOP1 -5.018e-01 1.799e-01 -2.789 0.005288 **
## MHKOOP2 -1.101e+00 1.719e-01 -6.408 1.47e-10 ***
## MHKOOP3 -3.033e-01 1.637e-01 -1.853 0.063837 .
## MHKOOP4 -3.893e-01 1.646e-01 -2.366 0.017996 *
## MHKOOP5 -3.748e-01 1.586e-01 -2.362 0.018159 *
## MHKOOP6 -8.647e-02 1.572e-01 -0.550 0.582292
## MHKOOP7 -6.817e-01 1.521e-01 -4.482 7.40e-06 ***
## MHKOOP8 -3.731e-01 1.651e-01 -2.260 0.023796 *
## MHKOOP9 -9.742e-02 1.516e-01 -0.643 0.520548
## MAUT11 9.743e+00 8.002e+03 0.001 0.999029
## MAUT12 1.279e+01 4.439e+03 0.003 0.997702
## MAUT13 1.253e+01 4.439e+03 0.003 0.997748
## MAUT14 1.516e+01 4.439e+03 0.003 0.997275
## MAUT15 1.581e+01 4.439e+03 0.004 0.997158
## MAUT16 1.592e+01 4.439e+03 0.004 0.997138
## MAUT17 1.625e+01 4.439e+03 0.004 0.997079
## MAUT18 1.646e+01 4.439e+03 0.004 0.997041
## MAUT19 1.575e+01 4.439e+03 0.004 0.997169
## MAUT21 2.152e-01 1.170e-01 1.840 0.065820 .
## MAUT22 -1.704e-01 1.624e-01 -1.049 0.294169
## MAUT23 -3.760e-01 2.579e-01 -1.458 0.144865
## MAUT24 4.427e-01 3.441e-01 1.287 0.198221
## MAUT25 1.446e-01 5.945e-01 0.243 0.807762
## MAUT26 3.485e+00 1.163e+00 2.996 0.002733 **
## MAUT27 2.589e+00 6.617e+03 0.000 0.999688
## MAUT01 -3.413e-02 1.432e-01 -0.238 0.811668
## MAUT02 3.263e-01 1.663e-01 1.962 0.049757 *
## MAUT03 7.060e-01 2.521e-01 2.800 0.005106 **
## MAUT04 -9.803e-01 3.538e-01 -2.771 0.005590 **
## MAUT05 6.194e-01 5.090e-01 1.217 0.223618
## MAUT06 3.628e+00 8.819e-01 4.114 3.89e-05 ***
## MAUT07 -5.926e+01 1.117e+03 -0.053 0.957700
## MAUT08 -5.818e+00 6.760e+03 -0.001 0.999313
## MAUT09 -3.893e+00 4.591e+03 -0.001 0.999323
## MINK30451 -1.238e-01 2.049e-01 -0.604 0.545630
## MINK30452 2.546e-01 1.592e-01 1.599 0.109756
## MINK30453 1.471e-01 1.530e-01 0.961 0.336555
## MINK30454 -3.928e-02 1.552e-01 -0.253 0.800180
## MINK30455 -1.862e-01 1.646e-01 -1.131 0.257924
## MINK30456 -2.170e-01 1.809e-01 -1.200 0.230295
## MINK30457 1.232e-01 2.224e-01 0.554 0.579541
## MINK30458 7.610e-02 3.992e-01 0.191 0.848798
## MINK30459 1.589e+00 2.904e-01 5.470 4.50e-08 ***
## MINK75121 5.293e-01 9.215e-02 5.744 9.24e-09 ***
## MINK75122 1.609e-01 1.082e-01 1.488 0.136831
## MINK75123 1.830e-01 1.545e-01 1.184 0.236218
## MINK75124 7.571e-01 1.994e-01 3.798 0.000146 ***
## MINK75125 -2.893e-01 2.855e-01 -1.013 0.310921
## MINK75126 -1.808e+01 1.777e+03 -0.010 0.991882
## MINK75127 -2.192e+01 6.523e+03 -0.003 0.997318
## MINK75128 -2.041e+01 2.392e+03 -0.009 0.993194
## MINK75129 3.738e+00 8.986e-01 4.160 3.19e-05 ***
## MINK123M1 -2.771e-01 1.092e-01 -2.537 0.011172 *
## MINK123M2 8.420e-02 2.388e-01 0.353 0.724437
## MINK123M3 -1.565e+00 4.367e-01 -3.585 0.000338 ***
## MINK123M4 -1.856e+01 1.116e+03 -0.017 0.986727
## MINK123M5 -1.938e+01 6.523e+03 -0.003 0.997629
## MINK123M7 -2.319e+01 6.523e+03 -0.004 0.997164
## MINK123M9 -1.710e+01 6.523e+03 -0.003 0.997908
## MKOOPKLA2 2.495e-01 6.925e-01 0.360 0.718609
## MKOOPKLA3 -1.578e+01 1.335e+03 -0.012 0.990567
## MKOOPKLA4 -1.520e+01 1.335e+03 -0.011 0.990911
## MKOOPKLA5 -1.471e+01 1.335e+03 -0.011 0.991206
## MKOOPKLA6 -1.297e+01 1.335e+03 -0.010 0.992249
## MKOOPKLA7 -1.400e+01 1.335e+03 -0.010 0.991630
## MKOOPKLA8 -1.396e+01 1.335e+03 -0.010 0.991652
## PPERSAUT4 -1.816e+01 6.523e+03 -0.003 0.997779
## PPERSAUT5 -1.540e-01 1.167e-01 -1.320 0.186823
## PPERSAUT6 1.822e+00 6.365e-02 28.627 < 2e-16 ***
## PPERSAUT7 -1.769e+01 8.132e+02 -0.022 0.982642
## PPERSAUT8 -1.666e+01 4.201e+03 -0.004 0.996835
## PMOTSCO3 3.417e+00 1.510e+00 2.263 0.023619 *
## PMOTSCO4 -1.211e-01 1.781e-01 -0.680 0.496465
## PMOTSCO5 3.240e-01 3.044e-01 1.064 0.287210
## PMOTSCO6 -2.791e+00 5.439e-01 -5.131 2.89e-07 ***
## PMOTSCO7 -1.912e+01 4.285e+03 -0.004 0.996439
## PVRAAUT4 -1.905e+01 6.523e+03 -0.003 0.997669
## PVRAAUT6 -1.727e+01 2.121e+03 -0.008 0.993503
## PVRAAUT9 4.590e-01 6.573e+03 0.000 0.999944
## PAANHANG1 8.150e-01 5.014e-01 1.626 0.104050
## PAANHANG2 1.093e+00 3.780e-01 2.891 0.003843 **
## PAANHANG3 -1.636e+01 2.227e+03 -0.007 0.994140
## PAANHANG4 -1.873e+01 6.523e+03 -0.003 0.997709
## PAANHANG5 -2.436e+00 6.859e+03 0.000 0.999717
## PWERKT2 -1.614e+01 2.301e+03 -0.007 0.994404
## PWERKT3 -1.641e+01 2.063e+03 -0.008 0.993656
## PWERKT4 -1.512e+01 1.888e+03 -0.008 0.993613
## PWERKT6 -1.385e+01 3.068e+03 -0.005 0.996398
## PWAOREG4 -1.878e+01 6.523e+03 -0.003 0.997703
## PWAOREG5 -1.633e+01 6.523e+03 -0.003 0.998002
## PWAOREG6 3.056e+00 4.099e-01 7.455 8.98e-14 ***
## PWAOREG7 -1.866e+01 3.214e+03 -0.006 0.995368
## PPLEZIER1 5.346e+00 1.540e+00 3.472 0.000517 ***
## PPLEZIER2 3.251e-01 1.424e+00 0.228 0.819386
## PPLEZIER3 2.489e+00 1.419e+00 1.754 0.079478 .
## PPLEZIER4 1.229e+00 1.177e+00 1.044 0.296664
## PPLEZIER5 -1.619e+01 3.925e+03 -0.004 0.996709
## PPLEZIER6 5.117e+01 8.605e+02 0.059 0.952586
## AWAPART1 3.934e-01 7.312e-02 5.380 7.44e-08 ***
## AWAPART2 -1.807e+01 2.063e+03 -0.009 0.993012
## AWALAND1 -1.516e+00 2.676e-01 -5.666 1.46e-08 ***
## ABROM1 -7.671e-01 1.497e-01 -5.125 2.97e-07 ***
## ABROM2 -1.739e+01 1.539e+03 -0.011 0.990984
## ALEVEN1 -9.233e-01 1.783e-01 -5.179 2.23e-07 ***
## ALEVEN2 1.606e-03 1.833e-01 0.009 0.993006
## ALEVEN3 1.952e-01 4.673e-01 0.418 0.676084
## ALEVEN4 1.807e+00 6.341e-01 2.849 0.004382 **
## ALEVEN8 -8.667e-01 7.758e+03 0.000 0.999911
## APERSONG1 -4.124e+00 9.580e-01 -4.305 1.67e-05 ***
## AGEZONG1 3.443e-01 3.000e-01 1.148 0.251029
## ABRAND1 3.499e-01 7.426e-02 4.711 2.46e-06 ***
## ABRAND2 1.019e-01 2.022e-01 0.504 0.614426
## ABRAND3 -1.891e+01 1.779e+03 -0.011 0.991520
## ABRAND4 -1.711e+01 2.978e+03 -0.006 0.995414
## ABRAND5 -1.265e+01 3.790e+03 -0.003 0.997337
## ABRAND7 -1.600e+01 6.523e+03 -0.002 0.998042
## APLEZIER1 1.038e+00 1.269e+00 0.817 0.413744
## APLEZIER2 NA NA NA NA
## AFIETS1 1.080e+00 1.956e-01 5.523 3.34e-08 ***
## AFIETS2 4.689e-01 3.066e-01 1.529 0.126164
## AFIETS3 2.962e+00 1.169e+00 2.532 0.011331 *
## ABYSTAND1 9.915e-01 2.000e-01 4.956 7.18e-07 ***
## ABYSTAND2 -1.855e+01 6.523e+03 -0.003 0.997731
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 15177 on 10947 degrees of freedom
## Residual deviance: 9723 on 10662 degrees of freedom
## AIC: 10295
##
## Number of Fisher Scoring iterations: 17
drewMatrix(model.2, over_test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2132 101
## 1 1630 137
##
## Accuracy : 0.5672
## 95% CI : (0.5517, 0.5827)
## No Information Rate : 0.9405
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0355
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.57563
## Specificity : 0.56672
## Pos Pred Value : 0.07753
## Neg Pred Value : 0.95477
## Prevalence : 0.05950
## Detection Rate : 0.03425
## Detection Prevalence : 0.44175
## Balanced Accuracy : 0.57118
##
## 'Positive' Class : 1
##
# difference in deviance = Null deviance (15177.2) - 9766.3 = 5411
# Sensitivity : 64%
# Accuracy : 54%
As we have seen that most of the variables are significant, let’s say MGODRK1 is increased by 1 unit then we can say that the customer is likely to buy caravan insurance. The likelihood of buying insurance is .34.
Applying the GLM from the step-wise model(backwards), we see we have produced a good model. The reason as to why we consider it as an improved model is because we have relatively improved our accuracy and also reduced our variables. Now if we want to further improve our model, we can change our cut-off value to get better results, which in turn will increase specificity. Although the classifications is 0.46 but an average the model is producing good results.
Now will create another model which would be our 3rd model. This model is based on our domain knowledge which we have collected by reading a few articles and deciding the factors that play the most important role in determining whether the customer will buy an insurance or not. We spent a good amount of time adding and removing variables to come up with this model, which we assume would be a better one. To make sure it is, we have verified it as well.
corrplot(cor(subset(df_train , select = c("PBRAND", "MOSTYPE", "PPERSAUT", "MKOOPKLA", "MHKOOP", "CARAVAN"))), method = "number", type = "upper")
Simply put, we ran a corerelation matrix among the variables of interest
based on our domain knowledge and achieved the following matrix
train_3 = over_train
for(i in 1:ncol(train_3)){
train_3[,i] <- as.factor(train_3[,i])
}
model.3 = glm(formula = CARAVAN ~ PBRAND + MOSTYPE + PPERSAUT + MKOOPKLA+MHKOOP, family = binomial(link = "logit"),
data = train_3)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
model.3$xlevels[["PPERSAUT"]] <- union(model.3$xlevels[["PPERSAUT"]], levels(over_test$PPERSAUT))
predicted_3 = predict(model.3, over_test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
predictedClass_3 = ifelse(predicted_3>=0.5, 1, 0)
drewSummary(model.3)
##
## Call:
## glm(formula = CARAVAN ~ PBRAND + MOSTYPE + PPERSAUT + MKOOPKLA +
## MHKOOP, family = binomial(link = "logit"), data = train_3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1689 -0.8847 0.2209 0.9066 2.0789
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 14.84840 531.91996 0.028 0.977730
## PBRAND1 -0.69741 0.20121 -3.466 0.000528 ***
## PBRAND2 -1.10821 0.12976 -8.541 < 2e-16 ***
## PBRAND3 0.70967 0.06415 11.062 < 2e-16 ***
## PBRAND4 0.72419 0.05625 12.875 < 2e-16 ***
## PBRAND5 -0.16518 0.14691 -1.124 0.260876
## PBRAND6 -0.59417 0.18178 -3.269 0.001081 **
## PBRAND7 -16.25358 655.68587 -0.025 0.980224
## PBRAND8 -16.51102 2399.54472 -0.007 0.994510
## MOSTYPE2 -2.48726 0.54586 -4.557 5.20e-06 ***
## MOSTYPE3 -2.44264 0.52563 -4.647 3.37e-06 ***
## MOSTYPE4 -3.38937 0.58376 -5.806 6.39e-09 ***
## MOSTYPE5 -0.86187 1.15298 -0.748 0.454753
## MOSTYPE6 -0.31446 0.18655 -1.686 0.091857 .
## MOSTYPE7 -2.93594 0.56682 -5.180 2.22e-07 ***
## MOSTYPE8 -0.38850 0.35911 -1.082 0.279316
## MOSTYPE9 -2.06787 1.09704 -1.885 0.059437 .
## MOSTYPE10 -0.97191 0.19183 -5.067 4.05e-07 ***
## MOSTYPE11 -2.91971 0.52146 -5.599 2.16e-08 ***
## MOSTYPE12 -0.39390 0.36022 -1.094 0.274172
## MOSTYPE13 -2.27062 0.52318 -4.340 1.42e-05 ***
## MOSTYPE15 -31.43989 1149.29323 -0.027 0.978176
## MOSTYPE16 -32.34378 737.64043 -0.044 0.965026
## MOSTYPE17 -18.27821 724.48272 -0.025 0.979872
## MOSTYPE18 -32.03103 719.26840 -0.045 0.964480
## MOSTYPE19 -17.66798 1224.71101 -0.014 0.988490
## MOSTYPE20 -15.79842 531.91971 -0.030 0.976306
## MOSTYPE21 -32.06790 779.25911 -0.041 0.967175
## MOSTYPE22 -16.88569 531.91970 -0.032 0.974676
## MOSTYPE23 -1.51402 1.12995 -1.340 0.180277
## MOSTYPE24 -16.89003 531.91971 -0.032 0.974669
## MOSTYPE25 -16.27280 531.91996 -0.031 0.975594
## MOSTYPE26 -17.07803 531.92006 -0.032 0.974387
## MOSTYPE27 -16.36495 531.92004 -0.031 0.975456
## MOSTYPE28 -32.07183 685.79372 -0.047 0.962700
## MOSTYPE29 -1.80197 1.14436 -1.575 0.115335
## MOSTYPE30 -17.30764 531.91973 -0.033 0.974043
## MOSTYPE31 -16.86894 531.91997 -0.032 0.974701
## MOSTYPE32 -15.60633 531.91996 -0.029 0.976594
## MOSTYPE33 -0.99299 1.12335 -0.884 0.376719
## MOSTYPE34 -2.87946 0.53378 -5.395 6.87e-08 ***
## MOSTYPE35 -3.00097 1.06319 -2.823 0.004763 **
## MOSTYPE36 -0.37743 1.12784 -0.335 0.737890
## MOSTYPE37 -1.46596 1.09355 -1.341 0.180068
## MOSTYPE38 -1.31019 1.09405 -1.198 0.231089
## MOSTYPE39 -2.20077 1.06283 -2.071 0.038390 *
## MOSTYPE40 -17.22686 254.62643 -0.068 0.946060
## MOSTYPE41 -2.66862 1.10137 -2.423 0.015393 *
## PPERSAUT4 -15.52662 2399.54472 -0.006 0.994837
## PPERSAUT5 -0.06279 0.09359 -0.671 0.502287
## PPERSAUT6 1.57751 0.05024 31.398 < 2e-16 ***
## PPERSAUT7 -15.57298 333.14133 -0.047 0.962716
## PPERSAUT8 -15.12060 1126.54079 -0.013 0.989291
## MKOOPKLA2 0.39402 0.51798 0.761 0.446843
## MKOOPKLA3 -15.51669 531.91877 -0.029 0.976728
## MKOOPKLA4 -14.43144 531.91883 -0.027 0.978355
## MKOOPKLA5 -13.93127 531.91889 -0.026 0.979105
## MKOOPKLA6 -13.52861 531.91969 -0.025 0.979709
## MKOOPKLA7 -15.16099 531.91983 -0.029 0.977261
## MKOOPKLA8 -15.99471 531.91993 -0.030 0.976011
## MHKOOP1 -0.11577 0.11516 -1.005 0.314793
## MHKOOP2 -0.29700 0.11981 -2.479 0.013179 *
## MHKOOP3 0.38327 0.11613 3.301 0.000965 ***
## MHKOOP4 0.19255 0.10873 1.771 0.076579 .
## MHKOOP5 -0.01002 0.10802 -0.093 0.926102
## MHKOOP6 0.48161 0.10298 4.677 2.91e-06 ***
## MHKOOP7 0.13166 0.10172 1.294 0.195534
## MHKOOP8 0.65145 0.11310 5.760 8.42e-09 ***
## MHKOOP9 0.52022 0.09889 5.261 1.43e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 15177 on 10947 degrees of freedom
## Residual deviance: 11912 on 10879 degrees of freedom
## AIC: 12050
##
## Number of Fisher Scoring iterations: 15
drewMatrix(model.3, over_test)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2350 84
## 1 1412 154
##
## Accuracy : 0.626
## 95% CI : (0.6108, 0.641)
## No Information Rate : 0.9405
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0752
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.64706
## Specificity : 0.62467
## Pos Pred Value : 0.09834
## Neg Pred Value : 0.96549
## Prevalence : 0.05950
## Detection Rate : 0.03850
## Detection Prevalence : 0.39150
## Balanced Accuracy : 0.63586
##
## 'Positive' Class : 1
##
getRMSE(predictedClass_3)
## [1] 0.842615
# Area under curve
pr <- prediction(predictedClass_3,over_test$CARAVAN)
perf <- performance(pr,measure = "tpr",x.measure = "fpr")
plot(perf) > auc(over_test$CARAVAN,predictedClass_3)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## logical(0)
auc_ROCR <- performance(pr, measure = "auc")
auc_ROCR <- auc_ROCR@y.values[[1]]
pR2(model.3)['McFadden']
## fitting null model for pseudo-r2
## McFadden
## 0.2151129
# difference in deviance = Null deviance (15177) - 12051 = 3025
# sensivity 72%
# Accuracy : 60%
For this model, we created a confusion matrix along with ROC. We see that the accuracy is approximately 60%, while sensitivity is also 60% and specificity is roughly 60% as well.
AUC ROC plot logistic regression Our AUC score is 0.59. This means that, we are relatively closer to 1 and we know that the closer we are to 1 the better.
pred_t <- predict(model.3, na.action=na.pass)
hist(pred_t)
boxplot(pred_t)
##Plotting residual histograms for training and validation data
resid.t<-residuals(model.3)
hist(resid.t)
From the above histogram we can see the range of our residuals is
between -2 and 2.
drewROC(model.3)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
From the above ROC curve we can see that if we compare it with the
previous ROC the area we have achieved a higher area under the curve
with respect tot the ROC.
lift.example <- lift(relevel(as.factor(over_test$CARAVAN), ref="1") ~ predicted_3, data = over_test)
#xyplot(lift.example, plot = "gain")
library(gains)
actual = as.numeric(over_test$CARAVAN)
predicted_3_num = as.numeric(predicted_3)
gain = gains(actual, predicted_3_num)
barplot(gain$mean.resp / mean(actual), names.arg = gain$depth, xlab = "Percentile", ylab = "Mean Response", main = "Decile-wise lift chart")
Since our data is categorical, we need to assess the predictive
performance our models. One method to do so is by looking at the
Decile-Wise chart. The idea behind Decide wise chart is that it divides
the data into 10 bins. Each bin tells us the % by which the model is
able to explain the predictive performance. It must also be kept in mind
that a good decile was chart is sort of downward slopping from left to
right. As we can see in this chart, that we achieved, somewhat a similar
thing. The predictive performance of our models is good because at each
bin we touch about 1% of the probability.
train_4 = over_train
for(i in 1:ncol(train_4)){
train_4[,i] <- as.factor(train_4[,i])
}
fit1 = rpart(formula=CARAVAN ~ .,data=over_train,method = 'class', control=rpart.control(minsplit=20, minbucket=1, cp=0.008))
printcp(fit1)
##
## Classification tree:
## rpart(formula = CARAVAN ~ ., data = over_train, method = "class",
## control = rpart.control(minsplit = 20, minbucket = 1, cp = 0.008))
##
## Variables actually used in tree construction:
## [1] MBERHOOG MBERMIDD MFALLEEN MGODGE MHKOOP MINKGEM MOSTYPE MSKB2
## [9] MSKC PBRAND PPERSAUT
##
## Root node error: 5474/10948 = 0.5
##
## n= 10948
##
## CP nsplit rel error xerror xstd
## 1 0.3818049 0 1.00000 1.02320 0.0095547
## 2 0.0277676 1 0.61820 0.61838 0.0088339
## 3 0.0217391 3 0.56266 0.57033 0.0086301
## 4 0.0105042 4 0.54092 0.55225 0.0085457
## 5 0.0088905 6 0.51991 0.51096 0.0083364
## 6 0.0080000 13 0.44574 0.47077 0.0081091
fancyRpartPlot(fit1)
glm_6 = glm(formula = CARAVAN ~ PPERSAUT + MBERHOOG +MGODPR + MHKOOP +MINKGEM +MINKM30 +MOSTYPE +PBRAND +PBROM, family = binomial(link = "logit"),data = train_4)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
glm_6$xlevels[["PPERSAUT"]] <- union(glm_6$xlevels[["PPERSAUT"]], levels(over_test$PPERSAUT))
summary(glm_6)
##
## Call:
## glm(formula = CARAVAN ~ PPERSAUT + MBERHOOG + MGODPR + MHKOOP +
## MINKGEM + MINKM30 + MOSTYPE + PBRAND + PBROM, family = binomial(link = "logit"),
## data = train_4)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5544 -0.8385 0.1426 0.8595 1.9681
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.979e+01 4.292e+02 -0.046 0.963221
## PPERSAUT4 -1.481e+01 2.400e+03 -0.006 0.995075
## PPERSAUT5 -7.410e-02 9.685e-02 -0.765 0.444243
## PPERSAUT6 1.581e+00 5.307e-02 29.798 < 2e-16 ***
## PPERSAUT7 -1.575e+01 3.249e+02 -0.048 0.961343
## PPERSAUT8 -1.502e+01 1.002e+03 -0.015 0.988038
## MBERHOOG1 1.655e-01 7.862e-02 2.105 0.035328 *
## MBERHOOG2 2.065e-01 7.579e-02 2.725 0.006434 **
## MBERHOOG3 -2.120e-02 9.136e-02 -0.232 0.816502
## MBERHOOG4 2.924e-01 1.115e-01 2.622 0.008740 **
## MBERHOOG5 -4.591e-01 1.452e-01 -3.162 0.001565 **
## MBERHOOG6 8.413e-02 1.579e-01 0.533 0.594258
## MBERHOOG7 9.128e-01 1.900e-01 4.805 1.55e-06 ***
## MBERHOOG8 9.297e-01 3.203e-01 2.903 0.003696 **
## MBERHOOG9 -2.876e-01 3.884e-01 -0.740 0.459136
## MGODPR1 1.473e+00 3.797e-01 3.878 0.000105 ***
## MGODPR2 1.761e+00 3.401e-01 5.178 2.24e-07 ***
## MGODPR3 2.011e+00 3.362e-01 5.982 2.20e-09 ***
## MGODPR4 1.891e+00 3.304e-01 5.721 1.06e-08 ***
## MGODPR5 2.090e+00 3.317e-01 6.302 2.95e-10 ***
## MGODPR6 1.939e+00 3.355e-01 5.781 7.42e-09 ***
## MGODPR7 2.659e+00 3.355e-01 7.926 2.27e-15 ***
## MGODPR8 1.533e+00 4.041e-01 3.794 0.000148 ***
## MGODPR9 1.781e+00 3.616e-01 4.924 8.46e-07 ***
## MHKOOP1 -4.522e-01 1.244e-01 -3.633 0.000280 ***
## MHKOOP2 -3.963e-01 1.273e-01 -3.114 0.001846 **
## MHKOOP3 1.152e-01 1.271e-01 0.906 0.365026
## MHKOOP4 -9.811e-02 1.186e-01 -0.827 0.408147
## MHKOOP5 -4.676e-01 1.177e-01 -3.972 7.13e-05 ***
## MHKOOP6 9.456e-03 1.175e-01 0.080 0.935871
## MHKOOP7 -3.840e-01 1.165e-01 -3.297 0.000976 ***
## MHKOOP8 -1.893e-02 1.279e-01 -0.148 0.882338
## MHKOOP9 3.286e-03 1.178e-01 0.028 0.977742
## MINKGEM1 1.617e+01 4.292e+02 0.038 0.969943
## MINKGEM2 1.628e+01 4.292e+02 0.038 0.969749
## MINKGEM3 1.627e+01 4.292e+02 0.038 0.969762
## MINKGEM4 1.679e+01 4.292e+02 0.039 0.968789
## MINKGEM5 1.687e+01 4.292e+02 0.039 0.968643
## MINKGEM6 1.638e+01 4.292e+02 0.038 0.969554
## MINKGEM7 1.726e+01 4.292e+02 0.040 0.967926
## MINKGEM8 1.727e+01 4.292e+02 0.040 0.967905
## MINKGEM9 6.633e-01 6.125e+02 0.001 0.999136
## MINKM301 3.282e-01 8.967e-02 3.660 0.000252 ***
## MINKM302 5.202e-01 7.935e-02 6.556 5.51e-11 ***
## MINKM303 1.136e-02 9.222e-02 0.123 0.901951
## MINKM304 -3.912e-01 1.174e-01 -3.332 0.000862 ***
## MINKM305 -8.430e-02 1.284e-01 -0.657 0.511387
## MINKM306 3.190e-01 1.777e-01 1.795 0.072658 .
## MINKM307 2.203e-01 2.239e-01 0.984 0.325214
## MINKM308 -4.193e-01 3.681e-01 -1.139 0.254637
## MINKM309 -1.548e+01 2.810e+02 -0.055 0.956084
## MOSTYPE2 2.043e-01 2.478e-01 0.824 0.409801
## MOSTYPE3 2.224e-01 1.868e-01 1.190 0.233871
## MOSTYPE4 -8.216e-01 3.213e-01 -2.557 0.010546 *
## MOSTYPE5 -1.795e-01 3.350e-01 -0.536 0.591997
## MOSTYPE6 -3.494e-01 2.060e-01 -1.696 0.089872 .
## MOSTYPE7 -7.681e-01 3.176e-01 -2.419 0.015573 *
## MOSTYPE8 4.584e-01 1.759e-01 2.606 0.009164 **
## MOSTYPE9 -2.203e-01 2.054e-01 -1.072 0.283581
## MOSTYPE10 -9.310e-01 2.130e-01 -4.371 1.24e-05 ***
## MOSTYPE11 -4.856e-01 2.189e-01 -2.218 0.026521 *
## MOSTYPE12 4.501e-01 2.152e-01 2.092 0.036457 *
## MOSTYPE13 9.188e-02 2.024e-01 0.454 0.649794
## MOSTYPE15 -1.399e+01 1.009e+03 -0.014 0.988943
## MOSTYPE16 -1.521e+01 4.887e+02 -0.031 0.975172
## MOSTYPE17 -1.512e+01 7.191e+02 -0.021 0.983224
## MOSTYPE18 -1.526e+01 4.774e+02 -0.032 0.974500
## MOSTYPE19 -1.606e+01 1.235e+03 -0.013 0.989626
## MOSTYPE20 4.775e-01 3.614e-01 1.321 0.186441
## MOSTYPE21 -1.510e+01 5.837e+02 -0.026 0.979356
## MOSTYPE22 -3.598e-01 2.593e-01 -1.388 0.165273
## MOSTYPE23 -4.672e-01 2.371e-01 -1.970 0.048788 *
## MOSTYPE24 -1.440e-01 2.380e-01 -0.605 0.545264
## MOSTYPE25 1.020e-02 2.837e-01 0.036 0.971320
## MOSTYPE26 -9.598e-01 4.031e-01 -2.381 0.017250 *
## MOSTYPE27 -6.498e-01 3.640e-01 -1.785 0.074251 .
## MOSTYPE28 -1.606e+01 4.178e+02 -0.038 0.969347
## MOSTYPE29 -1.074e+00 3.009e-01 -3.571 0.000356 ***
## MOSTYPE30 -5.886e-01 2.519e-01 -2.336 0.019477 *
## MOSTYPE31 -4.754e-01 2.331e-01 -2.040 0.041351 *
## MOSTYPE32 7.138e-01 2.246e-01 3.177 0.001486 **
## MOSTYPE33 -2.133e-01 1.794e-01 -1.189 0.234558
## MOSTYPE34 -4.996e-01 2.152e-01 -2.321 0.020285 *
## MOSTYPE35 -1.072e+00 2.132e-01 -5.029 4.92e-07 ***
## MOSTYPE36 3.005e-01 2.019e-01 1.488 0.136693
## MOSTYPE37 2.169e-01 2.198e-01 0.987 0.323688
## MOSTYPE38 6.371e-01 1.942e-01 3.281 0.001035 **
## MOSTYPE39 7.777e-03 1.942e-01 0.040 0.968054
## MOSTYPE40 -1.626e+01 2.480e+02 -0.066 0.947712
## MOSTYPE41 -1.270e+00 2.312e-01 -5.492 3.97e-08 ***
## PBRAND1 -6.867e-01 2.087e-01 -3.291 0.000999 ***
## PBRAND2 -1.142e+00 1.325e-01 -8.617 < 2e-16 ***
## PBRAND3 7.107e-01 6.666e-02 10.661 < 2e-16 ***
## PBRAND4 7.790e-01 5.966e-02 13.058 < 2e-16 ***
## PBRAND5 -9.026e-02 1.555e-01 -0.581 0.561494
## PBRAND6 -6.508e-01 1.900e-01 -3.425 0.000614 ***
## PBRAND7 -1.614e+01 6.320e+02 -0.026 0.979630
## PBRAND8 -1.597e+01 2.400e+03 -0.007 0.994691
## PBROM2 -8.912e-01 3.930e-01 -2.267 0.023364 *
## PBROM3 -8.011e-01 1.447e-01 -5.536 3.10e-08 ***
## PBROM4 -1.527e+01 2.698e+02 -0.057 0.954862
## PBROM5 -2.445e-01 5.191e-01 -0.471 0.637675
## PBROM6 -1.592e+01 2.400e+03 -0.007 0.994707
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 15177 on 10947 degrees of freedom
## Residual deviance: 11274 on 10845 degrees of freedom
## AIC: 11480
##
## Number of Fisher Scoring iterations: 15
predicted_6 = predict(glm_6, over_test, type = "response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
predictedClass_6 = ifelse(predicted_6>=0.5, 1, 0)
confusionMatrix(as.factor(predictedClass_6), as.factor(over_test$CARAVAN), positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2514 101
## 1 1248 137
##
## Accuracy : 0.6628
## 95% CI : (0.6479, 0.6774)
## No Information Rate : 0.9405
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0749
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.57563
## Specificity : 0.66826
## Pos Pred Value : 0.09892
## Neg Pred Value : 0.96138
## Prevalence : 0.05950
## Detection Rate : 0.03425
## Detection Prevalence : 0.34625
## Balanced Accuracy : 0.62195
##
## 'Positive' Class : 1
##
accuracy(predictedClass_6, as.numeric(over_test$CARAVAN))
## ME RMSE MAE MPE MAPE
## Test set 0.71325 0.8739279 0.71325 67.0875 67.0875
pR2(glm_6)['McFadden']
## fitting null model for pseudo-r2
## McFadden
## 0.2571451
# difference in deviance = Null deviance (15177) - 12069 = 3108
# Accuracy 68%
# Sensitivity 58%
Now we ran another model based on decision tree. The good thing about decision tree is that it gives us variables of most importance having a a greater impact on our response variable. From the decision tree we are able to see that 5 variables are of most importance. The above model is ran at cut-off .5 and we achieved an accuracy of 68%, sensitivity of 58% and specificity of 68%.
However, we ran the model with cut-off at .4 as well. We observed that while accuracy decreased, specificity increased.
drewROC(glm_6)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
anova(logit.reg,model.2,test = 'Chisq')
## Analysis of Deviance Table
##
## Model 1: CARAVAN ~ MOSHOOFD + MGEMOMV + OneHouse + MINKGEM_c + MGEMLEEF_c
## Model 2: CARAVAN ~ MOSTYPE + MGODRK + MGODPR + MGODOV + MRELGE + MRELSA +
## MOPLMIDD + MOPLLAAG + MBERHOOG + MBERZELF + MBERBOER + MBERMIDD +
## MBERARBG + MBERARBO + MSKC + MSKD + MHKOOP + MAUT1 + MAUT2 +
## MAUT0 + MINK3045 + MINK7512 + MINK123M + MKOOPKLA + PPERSAUT +
## PMOTSCO + PVRAAUT + PAANHANG + PWERKT + PWAOREG + PPLEZIER +
## AWAPART + AWALAND + ABROM + ALEVEN + APERSONG + AGEZONG +
## ABRAND + APLEZIER + AFIETS + ABYSTAND
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 10931 14363
## 2 10662 9723 269 4640.4 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova(model.3,glm_6,test = 'Chisq')
## Analysis of Deviance Table
##
## Model 1: CARAVAN ~ PBRAND + MOSTYPE + PPERSAUT + MKOOPKLA + MHKOOP
## Model 2: CARAVAN ~ PPERSAUT + MBERHOOG + MGODPR + MHKOOP + MINKGEM + MINKM30 +
## MOSTYPE + PBRAND + PBROM
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 10879 11912
## 2 10845 11274 34 637.93 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1